[llvm] b3b4727 - [X86] Replace (most) X86ISD::SHLD/SHRD usage with ISD::FSHL/FSHR generic opcodes (PR39467)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 11 04:33:27 PDT 2020


Author: Simon Pilgrim
Date: 2020-03-11T11:17:49Z
New Revision: b3b4727a3e7e170189e58ee8a6409112839a87b0

URL: https://github.com/llvm/llvm-project/commit/b3b4727a3e7e170189e58ee8a6409112839a87b0
DIFF: https://github.com/llvm/llvm-project/commit/b3b4727a3e7e170189e58ee8a6409112839a87b0.diff

LOG: [X86] Replace (most) X86ISD::SHLD/SHRD usage with ISD::FSHL/FSHR generic opcodes (PR39467)

For the i32 and i64 cases, X86ISD::SHLD/SHRD are close enough to ISD::FSHL/FSHR that we can use them directly; we just need to account for the operand commutation for SHRD.
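
As a rough illustration of the i32 correspondence (hypothetical standalone helpers, not part of the patch; shift amounts assumed already reduced to [0,31]):

  #include <cstdint>

  // ISD::FSHL: concat(a,b) shifted left by s, keeping the high 32 bits.
  // x86 SHLD leaves exactly fshl32(dst, src, cl) in dst.
  uint32_t fshl32(uint32_t a, uint32_t b, uint32_t s) {
    return s ? (a << s) | (b >> (32 - s)) : a;
  }

  // ISD::FSHR: concat(a,b) shifted right by s, keeping the low 32 bits.
  // x86 SHRD leaves fshr32(src, dst, cl) in dst - the node operands are
  // commuted relative to the instruction's dst/src order.
  uint32_t fshr32(uint32_t a, uint32_t b, uint32_t s) {
    return s ? (b >> s) | (a << (32 - s)) : b;
  }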

The i16 SHLD/SHRD case is annoying as the shift amount is taken modulo 32 (vs the funnel shift's modulo 16), so I've added X86ISD::FSHL/FSHR equivalents, which match the generic implementation in all other respects.
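
A sketch of why i16 can't reuse the generic nodes directly (illustrative helper only; the masking mirrors the AND-with-15 in LowerFunnelShift below):

  #include <cstdint>

  // Generic i16 funnel shift: the amount is interpreted modulo 16.
  uint16_t fshl16(uint16_t a, uint16_t b, unsigned s) {
    s &= 15;
    return s ? (uint16_t)((a << s) | (b >> (16 - s))) : a;
  }
  // SHLDW/SHRDW mask their count modulo 32 instead, so e.g. a count of 20
  // is not reduced to 4 and counts in [16,31] don't give the generic
  // result. The lowering therefore masks the amount to [0,15] up front and
  // emits X86ISD::FSHL/FSHR, which keep the ISD::FSHL/FSHR operand order
  // but assume the pre-masked amount.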

Something I'm slightly concerned about is that ISD::FSHL/FSHR legality is controlled by the Subtarget.isSHLDSlow() feature flag - we don't normally use non-ISA features for this, but it allows the DAG combines to continue to operate after legalization in many more cases.

The X86 *bits.ll changes are all affected by the same issue - we now have a "FSHR(-1,-1,amt) -> ROTR(-1,amt) -> (-1)" simplification that reduces the dependencies enough for the branch fallthrough code to get shuffled around.
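
A standalone C++ illustration of the fold (not the actual DAG combine):

  #include <cstdint>

  // fshr(x, x, s) is a rotate right, and rotating all-ones by any amount
  // is still all-ones, so the whole chain folds to -1:
  uint32_t fold_example(uint32_t s) {
    uint32_t x = 0xFFFFFFFFu;
    s &= 31;
    uint32_t r = s ? (x >> s) | (x << (32 - s)) : x;  // fshr(x,x,s) == rotr(x,s)
    return r;                                         // always 0xFFFFFFFF
  }
  // With the shrdl on the all-ones value folded away, the remaining
  // test/cmov sequence has fewer dependencies, which is what perturbs the
  // branch fallthrough layout in those tests.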

Differential Revision: https://reviews.llvm.org/D75748

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86ISelLowering.h
    llvm/lib/Target/X86/X86InstrCompiler.td
    llvm/lib/Target/X86/X86InstrInfo.td
    llvm/lib/Target/X86/X86InstrShiftRotate.td
    llvm/test/CodeGen/X86/clear-highbits.ll
    llvm/test/CodeGen/X86/clear-lowbits.ll
    llvm/test/CodeGen/X86/extract-bits.ll
    llvm/test/CodeGen/X86/extract-lowbits.ll
    llvm/test/CodeGen/X86/fshl.ll
    llvm/test/CodeGen/X86/fshr.ll
    llvm/test/CodeGen/X86/shift-combine.ll
    llvm/test/CodeGen/X86/shift-parts.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a56dd0427127..4c86c87fb33b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -207,10 +207,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
   // Funnel shifts.
   for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
+    // For slow shld targets we only lower for code size.
+    LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
+
     setOperationAction(ShiftOp             , MVT::i16  , Custom);
-    setOperationAction(ShiftOp             , MVT::i32  , Custom);
+    setOperationAction(ShiftOp             , MVT::i32  , ShiftDoubleAction);
     if (Subtarget.is64Bit())
-      setOperationAction(ShiftOp           , MVT::i64  , Custom);
+      setOperationAction(ShiftOp           , MVT::i64  , ShiftDoubleAction);
   }
 
   if (!Subtarget.useSoftFloat()) {
@@ -18860,16 +18863,15 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
   if (!OptForSize && Subtarget.isSHLDSlow())
     return SDValue();
 
-  if (IsFSHR)
-    std::swap(Op0, Op1);
-
   // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
-  if (VT == MVT::i16)
+  if (VT == MVT::i16) {
     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
                       DAG.getConstant(15, DL, Amt.getValueType()));
+    unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
+    return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
+  }
 
-  unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
-  return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
+  return Op;
 }
 
 // Try to use a packed vector operation to handle i64 on 32-bit targets when
@@ -29963,8 +29965,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
 #define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
   NODE_NAME_CASE(BSF)
   NODE_NAME_CASE(BSR)
-  NODE_NAME_CASE(SHLD)
-  NODE_NAME_CASE(SHRD)
+  NODE_NAME_CASE(FSHL)
+  NODE_NAME_CASE(FSHR)
   NODE_NAME_CASE(FAND)
   NODE_NAME_CASE(FANDN)
   NODE_NAME_CASE(FOR)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index d102685e1cbe..0da22acc99de 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -33,10 +33,12 @@ namespace llvm {
       /// Bit scan reverse.
       BSR,
 
-      /// Double shift instructions. These correspond to
-      /// X86::SHLDxx and X86::SHRDxx instructions.
-      SHLD,
-      SHRD,
+      /// X86 funnel/double shift i16 instructions. These correspond to
+      /// X86::SHLDW and X86::SHRDW instructions which have different amt
+      /// modulo rules to generic funnel shifts.
+      /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
+      FSHL,
+      FSHR,
 
       /// Bitwise logical AND of floating point values. This corresponds
       /// to X86::ANDPS or X86::ANDPD.

diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index fe1efd2fc097..0ad53c42550e 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1782,21 +1782,24 @@ multiclass MaskedRotateAmountPats<SDNode frag, string name> {
 defm : MaskedRotateAmountPats<rotl, "ROL">;
 defm : MaskedRotateAmountPats<rotr, "ROR">;
 
-// Double shift amount is implicitly masked.
-multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
-  // (shift x (and y, 31)) ==> (shift x, y)
-  def : Pat<(frag GR16:$src1, GR16:$src2, (shiftMask32 CL)),
-            (!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>;
-  def : Pat<(frag GR32:$src1, GR32:$src2, (shiftMask32 CL)),
-            (!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>;
-
-  // (shift x (and y, 63)) ==> (shift x, y)
-  def : Pat<(frag GR64:$src1, GR64:$src2, (shiftMask32 CL)),
-            (!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>;
-}
-
-defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">;
-defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;
+// Double "funnel" shift amount is implicitly masked.
+// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) (NOTE: modulo32)
+def : Pat<(X86fshl GR16:$src1, GR16:$src2, (shiftMask32 CL)),
+          (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+def : Pat<(X86fshr GR16:$src2, GR16:$src1, (shiftMask32 CL)),
+          (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y)
+def : Pat<(fshl GR32:$src1, GR32:$src2, (shiftMask32 CL)),
+          (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+def : Pat<(fshr GR32:$src2, GR32:$src1, (shiftMask32 CL)),
+          (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+// (fshl/fshr x (and y, 63)) ==> (fshl/fshr x, y)
+def : Pat<(fshl GR64:$src1, GR64:$src2, (shiftMask64 CL)),
+          (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+def : Pat<(fshr GR64:$src2, GR64:$src1, (shiftMask64 CL)),
+          (SHRD64rrCL GR64:$src1, GR64:$src2)>;
 
 let Predicates = [HasBMI2] in {
   let AddedComplexity = 1 in {

diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 8629cf728b45..173b5ef95238 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -143,8 +143,8 @@ def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
 
 def X86bsf     : SDNode<"X86ISD::BSF",      SDTUnaryArithWithFlags>;
 def X86bsr     : SDNode<"X86ISD::BSR",      SDTUnaryArithWithFlags>;
-def X86shld    : SDNode<"X86ISD::SHLD",     SDTIntShiftDOp>;
-def X86shrd    : SDNode<"X86ISD::SHRD",     SDTIntShiftDOp>;
+def X86fshl    : SDNode<"X86ISD::FSHL",     SDTIntShiftDOp>;
+def X86fshr    : SDNode<"X86ISD::FSHR",     SDTIntShiftDOp>;
 
 def X86cmp     : SDNode<"X86ISD::CMP" ,     SDTX86CmpTest>;
 def X86fcmp    : SDNode<"X86ISD::FCMP",     SDTX86FCmp>;

diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td
index 9d974b716dda..ddc273b1706d 100644
--- a/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -661,32 +661,32 @@ let Uses = [CL], SchedRW = [WriteSHDrrcl] in {
 def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
                    (ins GR16:$src1, GR16:$src2),
                    "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                   [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
+                   [(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2, CL))]>,
                    TB, OpSize16;
 def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
                    (ins GR16:$src1, GR16:$src2),
                    "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                   [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
+                   [(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1, CL))]>,
                    TB, OpSize16;
 def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
                    (ins GR32:$src1, GR32:$src2),
                    "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                   [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>,
+                   [(set GR32:$dst, (fshl GR32:$src1, GR32:$src2, CL))]>,
                    TB, OpSize32;
 def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
                    (ins GR32:$src1, GR32:$src2),
                    "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                   [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>,
+                   [(set GR32:$dst, (fshr GR32:$src2, GR32:$src1, CL))]>,
                    TB, OpSize32;
 def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
                     (ins GR64:$src1, GR64:$src2),
                     "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                    [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>,
+                    [(set GR64:$dst, (fshl GR64:$src1, GR64:$src2, CL))]>,
                     TB;
 def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
                     (ins GR64:$src1, GR64:$src2),
                     "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                    [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>,
+                    [(set GR64:$dst, (fshr GR64:$src2, GR64:$src1, CL))]>,
                     TB;
 } // SchedRW
 
@@ -695,42 +695,42 @@ def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
                      (outs GR16:$dst),
                      (ins GR16:$src1, GR16:$src2, u8imm:$src3),
                      "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                     [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+                     [(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2,
                                       (i8 imm:$src3)))]>,
                      TB, OpSize16;
 def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
                      (outs GR16:$dst),
                      (ins GR16:$src1, GR16:$src2, u8imm:$src3),
                      "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                     [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+                     [(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1,
                                       (i8 imm:$src3)))]>,
                      TB, OpSize16;
 def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
                      (outs GR32:$dst),
                      (ins GR32:$src1, GR32:$src2, u8imm:$src3),
                      "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                     [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+                     [(set GR32:$dst, (fshl GR32:$src1, GR32:$src2,
                                       (i8 imm:$src3)))]>,
                  TB, OpSize32;
 def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
                      (outs GR32:$dst),
                      (ins GR32:$src1, GR32:$src2, u8imm:$src3),
                      "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                     [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+                     [(set GR32:$dst, (fshr GR32:$src2, GR32:$src1,
                                       (i8 imm:$src3)))]>,
                  TB, OpSize32;
 def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
                       (outs GR64:$dst),
                       (ins GR64:$src1, GR64:$src2, u8imm:$src3),
                       "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                      [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+                      [(set GR64:$dst, (fshl GR64:$src1, GR64:$src2,
                                        (i8 imm:$src3)))]>,
                  TB;
 def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
                       (outs GR64:$dst),
                       (ins GR64:$src1, GR64:$src2, u8imm:$src3),
                       "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                      [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+                      [(set GR64:$dst, (fshr GR64:$src2, GR64:$src1,
                                        (i8 imm:$src3)))]>,
                  TB;
 } // SchedRW
@@ -739,70 +739,70 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
 let Uses = [CL], SchedRW = [WriteSHDmrcl] in {
 def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
                    "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                   [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
-                     addr:$dst)]>, TB, OpSize16;
+                   [(store (X86fshl (loadi16 addr:$dst), GR16:$src2, CL),
+                                    addr:$dst)]>, TB, OpSize16;
 def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
                   "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                  [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
-                    addr:$dst)]>, TB, OpSize16;
+                  [(store (X86fshr GR16:$src2, (loadi16 addr:$dst), CL),
+                                   addr:$dst)]>, TB, OpSize16;
 
 def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
                    "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                   [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+                   [(store (fshl (loadi32 addr:$dst), GR32:$src2, CL),
                      addr:$dst)]>, TB, OpSize32;
 def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
                   "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                  [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
-                    addr:$dst)]>, TB, OpSize32;
+                  [(store (fshr GR32:$src2, (loadi32 addr:$dst), CL),
+                                addr:$dst)]>, TB, OpSize32;
 
 def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
                     "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                    [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
-                      addr:$dst)]>, TB;
+                    [(store (fshl (loadi64 addr:$dst), GR64:$src2, CL),
+                                  addr:$dst)]>, TB;
 def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
                     "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
-                    [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
-                      addr:$dst)]>, TB;
+                    [(store (fshr GR64:$src2, (loadi64 addr:$dst), CL),
+                                  addr:$dst)]>, TB;
 } // SchedRW
 
 let SchedRW = [WriteSHDmri] in {
 def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
                     (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
                     "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                    [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
-                                      (i8 imm:$src3)), addr:$dst)]>,
+                    [(store (X86fshl (loadi16 addr:$dst), GR16:$src2,
+                                     (i8 imm:$src3)), addr:$dst)]>,
                     TB, OpSize16;
 def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
                      (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
                      "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                    [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
-                                      (i8 imm:$src3)), addr:$dst)]>,
+                    [(store (X86fshr GR16:$src2, (loadi16 addr:$dst),
+                                     (i8 imm:$src3)), addr:$dst)]>,
                      TB, OpSize16;
 
 def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
                     (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
                     "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                    [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
-                                      (i8 imm:$src3)), addr:$dst)]>,
+                    [(store (fshl (loadi32 addr:$dst), GR32:$src2,
+                                  (i8 imm:$src3)), addr:$dst)]>,
                     TB, OpSize32;
 def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
                      (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
                      "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                     [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
-                                       (i8 imm:$src3)), addr:$dst)]>,
+                     [(store (fshr GR32:$src2, (loadi32 addr:$dst),
+                                   (i8 imm:$src3)), addr:$dst)]>,
                      TB, OpSize32;
 
 def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
                       (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
                       "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                      [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
-                                       (i8 imm:$src3)), addr:$dst)]>,
+                      [(store (fshl (loadi64 addr:$dst), GR64:$src2,
+                                    (i8 imm:$src3)), addr:$dst)]>,
                  TB;
 def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
                       (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
                       "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                      [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
-                                       (i8 imm:$src3)), addr:$dst)]>,
+                      [(store (fshr GR64:$src2, (loadi64 addr:$dst),
+                                    (i8 imm:$src3)), addr:$dst)]>,
                  TB;
 } // SchedRW
 

diff --git a/llvm/test/CodeGen/X86/clear-highbits.ll b/llvm/test/CodeGen/X86/clear-highbits.ll
index 10e96c1b1ca3..be34c5de550a 100644
--- a/llvm/test/CodeGen/X86/clear-highbits.ll
+++ b/llvm/test/CodeGen/X86/clear-highbits.ll
@@ -513,33 +513,36 @@ define i32 @clear_highbits32_c4_commutative(i32 %val, i32 %numhighbits) nounwind
 define i64 @clear_highbits64_c0(i64 %val, i64 %numhighbits) nounwind {
 ; X86-FALLBACK0-LABEL: clear_highbits64_c0:
 ; X86-FALLBACK0:       # %bb.0:
+; X86-FALLBACK0-NEXT:    pushl %esi
 ; X86-FALLBACK0-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-FALLBACK0-NEXT:    movl $-1, %eax
-; X86-FALLBACK0-NEXT:    movl $-1, %edx
-; X86-FALLBACK0-NEXT:    shrl %cl, %edx
-; X86-FALLBACK0-NEXT:    shrdl %cl, %eax, %eax
-; X86-FALLBACK0-NEXT:    testb $32, %cl
-; X86-FALLBACK0-NEXT:    je .LBB13_2
-; X86-FALLBACK0-NEXT:  # %bb.1:
-; X86-FALLBACK0-NEXT:    movl %edx, %eax
+; X86-FALLBACK0-NEXT:    movl $-1, %esi
+; X86-FALLBACK0-NEXT:    shrl %cl, %esi
 ; X86-FALLBACK0-NEXT:    xorl %edx, %edx
-; X86-FALLBACK0-NEXT:  .LBB13_2:
+; X86-FALLBACK0-NEXT:    testb $32, %cl
+; X86-FALLBACK0-NEXT:    jne .LBB13_1
+; X86-FALLBACK0-NEXT:  # %bb.2:
+; X86-FALLBACK0-NEXT:    movl %esi, %edx
+; X86-FALLBACK0-NEXT:    jmp .LBB13_3
+; X86-FALLBACK0-NEXT:  .LBB13_1:
+; X86-FALLBACK0-NEXT:    movl %esi, %eax
+; X86-FALLBACK0-NEXT:  .LBB13_3:
 ; X86-FALLBACK0-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-FALLBACK0-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-FALLBACK0-NEXT:    popl %esi
 ; X86-FALLBACK0-NEXT:    retl
 ;
 ; X86-FALLBACK1-LABEL: clear_highbits64_c0:
 ; X86-FALLBACK1:       # %bb.0:
 ; X86-FALLBACK1-NEXT:    pushl %esi
 ; X86-FALLBACK1-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-FALLBACK1-NEXT:    movl $-1, %eax
 ; X86-FALLBACK1-NEXT:    movl $-1, %esi
-; X86-FALLBACK1-NEXT:    shrl %cl, %esi
-; X86-FALLBACK1-NEXT:    shrdl %cl, %eax, %eax
+; X86-FALLBACK1-NEXT:    movl $-1, %eax
+; X86-FALLBACK1-NEXT:    shrl %cl, %eax
 ; X86-FALLBACK1-NEXT:    xorl %edx, %edx
 ; X86-FALLBACK1-NEXT:    testb $32, %cl
-; X86-FALLBACK1-NEXT:    cmovnel %esi, %eax
-; X86-FALLBACK1-NEXT:    cmovel %esi, %edx
+; X86-FALLBACK1-NEXT:    cmovel %eax, %edx
+; X86-FALLBACK1-NEXT:    cmovel %esi, %eax
 ; X86-FALLBACK1-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-FALLBACK1-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-FALLBACK1-NEXT:    popl %esi
@@ -549,14 +552,13 @@ define i64 @clear_highbits64_c0(i64 %val, i64 %numhighbits) nounwind {
 ; X86-FALLBACK2:       # %bb.0:
 ; X86-FALLBACK2-NEXT:    pushl %esi
 ; X86-FALLBACK2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-FALLBACK2-NEXT:    movl $-1, %eax
 ; X86-FALLBACK2-NEXT:    movl $-1, %esi
-; X86-FALLBACK2-NEXT:    shrl %cl, %esi
-; X86-FALLBACK2-NEXT:    shrdl %cl, %eax, %eax
+; X86-FALLBACK2-NEXT:    movl $-1, %eax
+; X86-FALLBACK2-NEXT:    shrl %cl, %eax
 ; X86-FALLBACK2-NEXT:    xorl %edx, %edx
 ; X86-FALLBACK2-NEXT:    testb $32, %cl
-; X86-FALLBACK2-NEXT:    cmovnel %esi, %eax
-; X86-FALLBACK2-NEXT:    cmovel %esi, %edx
+; X86-FALLBACK2-NEXT:    cmovel %eax, %edx
+; X86-FALLBACK2-NEXT:    cmovel %esi, %eax
 ; X86-FALLBACK2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-FALLBACK2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-FALLBACK2-NEXT:    popl %esi
@@ -568,11 +570,10 @@ define i64 @clear_highbits64_c0(i64 %val, i64 %numhighbits) nounwind {
 ; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %eax
 ; X86-BMI2-NEXT:    shrxl %ecx, %eax, %esi
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    cmovnel %esi, %eax
 ; X86-BMI2-NEXT:    cmovel %esi, %edx
+; X86-BMI2-NEXT:    cmovnel %esi, %eax
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    popl %esi
@@ -600,33 +601,36 @@ define i64 @clear_highbits64_c0(i64 %val, i64 %numhighbits) nounwind {
 define i64 @clear_highbits64_c1_indexzext(i64 %val, i8 %numhighbits) nounwind {
 ; X86-FALLBACK0-LABEL: clear_highbits64_c1_indexzext:
 ; X86-FALLBACK0:       # %bb.0:
+; X86-FALLBACK0-NEXT:    pushl %esi
 ; X86-FALLBACK0-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-FALLBACK0-NEXT:    movl $-1, %eax
-; X86-FALLBACK0-NEXT:    movl $-1, %edx
-; X86-FALLBACK0-NEXT:    shrl %cl, %edx
-; X86-FALLBACK0-NEXT:    shrdl %cl, %eax, %eax
-; X86-FALLBACK0-NEXT:    testb $32, %cl
-; X86-FALLBACK0-NEXT:    je .LBB14_2
-; X86-FALLBACK0-NEXT:  # %bb.1:
-; X86-FALLBACK0-NEXT:    movl %edx, %eax
+; X86-FALLBACK0-NEXT:    movl $-1, %esi
+; X86-FALLBACK0-NEXT:    shrl %cl, %esi
 ; X86-FALLBACK0-NEXT:    xorl %edx, %edx
-; X86-FALLBACK0-NEXT:  .LBB14_2:
+; X86-FALLBACK0-NEXT:    testb $32, %cl
+; X86-FALLBACK0-NEXT:    jne .LBB14_1
+; X86-FALLBACK0-NEXT:  # %bb.2:
+; X86-FALLBACK0-NEXT:    movl %esi, %edx
+; X86-FALLBACK0-NEXT:    jmp .LBB14_3
+; X86-FALLBACK0-NEXT:  .LBB14_1:
+; X86-FALLBACK0-NEXT:    movl %esi, %eax
+; X86-FALLBACK0-NEXT:  .LBB14_3:
 ; X86-FALLBACK0-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-FALLBACK0-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-FALLBACK0-NEXT:    popl %esi
 ; X86-FALLBACK0-NEXT:    retl
 ;
 ; X86-FALLBACK1-LABEL: clear_highbits64_c1_indexzext:
 ; X86-FALLBACK1:       # %bb.0:
 ; X86-FALLBACK1-NEXT:    pushl %esi
 ; X86-FALLBACK1-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-FALLBACK1-NEXT:    movl $-1, %eax
 ; X86-FALLBACK1-NEXT:    movl $-1, %esi
-; X86-FALLBACK1-NEXT:    shrl %cl, %esi
-; X86-FALLBACK1-NEXT:    shrdl %cl, %eax, %eax
+; X86-FALLBACK1-NEXT:    movl $-1, %eax
+; X86-FALLBACK1-NEXT:    shrl %cl, %eax
 ; X86-FALLBACK1-NEXT:    xorl %edx, %edx
 ; X86-FALLBACK1-NEXT:    testb $32, %cl
-; X86-FALLBACK1-NEXT:    cmovnel %esi, %eax
-; X86-FALLBACK1-NEXT:    cmovel %esi, %edx
+; X86-FALLBACK1-NEXT:    cmovel %eax, %edx
+; X86-FALLBACK1-NEXT:    cmovel %esi, %eax
 ; X86-FALLBACK1-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-FALLBACK1-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-FALLBACK1-NEXT:    popl %esi
@@ -636,14 +640,13 @@ define i64 @clear_highbits64_c1_indexzext(i64 %val, i8 %numhighbits) nounwind {
 ; X86-FALLBACK2:       # %bb.0:
 ; X86-FALLBACK2-NEXT:    pushl %esi
 ; X86-FALLBACK2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-FALLBACK2-NEXT:    movl $-1, %eax
 ; X86-FALLBACK2-NEXT:    movl $-1, %esi
-; X86-FALLBACK2-NEXT:    shrl %cl, %esi
-; X86-FALLBACK2-NEXT:    shrdl %cl, %eax, %eax
+; X86-FALLBACK2-NEXT:    movl $-1, %eax
+; X86-FALLBACK2-NEXT:    shrl %cl, %eax
 ; X86-FALLBACK2-NEXT:    xorl %edx, %edx
 ; X86-FALLBACK2-NEXT:    testb $32, %cl
-; X86-FALLBACK2-NEXT:    cmovnel %esi, %eax
-; X86-FALLBACK2-NEXT:    cmovel %esi, %edx
+; X86-FALLBACK2-NEXT:    cmovel %eax, %edx
+; X86-FALLBACK2-NEXT:    cmovel %esi, %eax
 ; X86-FALLBACK2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-FALLBACK2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-FALLBACK2-NEXT:    popl %esi
@@ -655,11 +658,10 @@ define i64 @clear_highbits64_c1_indexzext(i64 %val, i8 %numhighbits) nounwind {
 ; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %eax
 ; X86-BMI2-NEXT:    shrxl %ecx, %eax, %esi
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    cmovnel %esi, %eax
 ; X86-BMI2-NEXT:    cmovel %esi, %edx
+; X86-BMI2-NEXT:    cmovnel %esi, %eax
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    popl %esi
@@ -689,22 +691,26 @@ define i64 @clear_highbits64_c1_indexzext(i64 %val, i8 %numhighbits) nounwind {
 define i64 @clear_highbits64_c2_load(i64* %w, i64 %numhighbits) nounwind {
 ; X86-FALLBACK0-LABEL: clear_highbits64_c2_load:
 ; X86-FALLBACK0:       # %bb.0:
+; X86-FALLBACK0-NEXT:    pushl %edi
 ; X86-FALLBACK0-NEXT:    pushl %esi
 ; X86-FALLBACK0-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-FALLBACK0-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-FALLBACK0-NEXT:    movl $-1, %eax
-; X86-FALLBACK0-NEXT:    movl $-1, %edx
-; X86-FALLBACK0-NEXT:    shrl %cl, %edx
-; X86-FALLBACK0-NEXT:    shrdl %cl, %eax, %eax
-; X86-FALLBACK0-NEXT:    testb $32, %cl
-; X86-FALLBACK0-NEXT:    je .LBB15_2
-; X86-FALLBACK0-NEXT:  # %bb.1:
-; X86-FALLBACK0-NEXT:    movl %edx, %eax
+; X86-FALLBACK0-NEXT:    movl $-1, %edi
+; X86-FALLBACK0-NEXT:    shrl %cl, %edi
 ; X86-FALLBACK0-NEXT:    xorl %edx, %edx
-; X86-FALLBACK0-NEXT:  .LBB15_2:
+; X86-FALLBACK0-NEXT:    testb $32, %cl
+; X86-FALLBACK0-NEXT:    jne .LBB15_1
+; X86-FALLBACK0-NEXT:  # %bb.2:
+; X86-FALLBACK0-NEXT:    movl %edi, %edx
+; X86-FALLBACK0-NEXT:    jmp .LBB15_3
+; X86-FALLBACK0-NEXT:  .LBB15_1:
+; X86-FALLBACK0-NEXT:    movl %edi, %eax
+; X86-FALLBACK0-NEXT:  .LBB15_3:
 ; X86-FALLBACK0-NEXT:    andl (%esi), %eax
 ; X86-FALLBACK0-NEXT:    andl 4(%esi), %edx
 ; X86-FALLBACK0-NEXT:    popl %esi
+; X86-FALLBACK0-NEXT:    popl %edi
 ; X86-FALLBACK0-NEXT:    retl
 ;
 ; X86-FALLBACK1-LABEL: clear_highbits64_c2_load:
@@ -713,14 +719,13 @@ define i64 @clear_highbits64_c2_load(i64* %w, i64 %numhighbits) nounwind {
 ; X86-FALLBACK1-NEXT:    pushl %esi
 ; X86-FALLBACK1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-FALLBACK1-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-FALLBACK1-NEXT:    movl $-1, %eax
 ; X86-FALLBACK1-NEXT:    movl $-1, %edi
-; X86-FALLBACK1-NEXT:    shrl %cl, %edi
-; X86-FALLBACK1-NEXT:    shrdl %cl, %eax, %eax
+; X86-FALLBACK1-NEXT:    movl $-1, %eax
+; X86-FALLBACK1-NEXT:    shrl %cl, %eax
 ; X86-FALLBACK1-NEXT:    xorl %edx, %edx
 ; X86-FALLBACK1-NEXT:    testb $32, %cl
-; X86-FALLBACK1-NEXT:    cmovnel %edi, %eax
-; X86-FALLBACK1-NEXT:    cmovel %edi, %edx
+; X86-FALLBACK1-NEXT:    cmovel %eax, %edx
+; X86-FALLBACK1-NEXT:    cmovel %edi, %eax
 ; X86-FALLBACK1-NEXT:    andl (%esi), %eax
 ; X86-FALLBACK1-NEXT:    andl 4(%esi), %edx
 ; X86-FALLBACK1-NEXT:    popl %esi
@@ -733,14 +738,13 @@ define i64 @clear_highbits64_c2_load(i64* %w, i64 %numhighbits) nounwind {
 ; X86-FALLBACK2-NEXT:    pushl %esi
 ; X86-FALLBACK2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-FALLBACK2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-FALLBACK2-NEXT:    movl $-1, %eax
 ; X86-FALLBACK2-NEXT:    movl $-1, %edi
-; X86-FALLBACK2-NEXT:    shrl %cl, %edi
-; X86-FALLBACK2-NEXT:    shrdl %cl, %eax, %eax
+; X86-FALLBACK2-NEXT:    movl $-1, %eax
+; X86-FALLBACK2-NEXT:    shrl %cl, %eax
 ; X86-FALLBACK2-NEXT:    xorl %edx, %edx
 ; X86-FALLBACK2-NEXT:    testb $32, %cl
-; X86-FALLBACK2-NEXT:    cmovnel %edi, %eax
-; X86-FALLBACK2-NEXT:    cmovel %edi, %edx
+; X86-FALLBACK2-NEXT:    cmovel %eax, %edx
+; X86-FALLBACK2-NEXT:    cmovel %edi, %eax
 ; X86-FALLBACK2-NEXT:    andl (%esi), %eax
 ; X86-FALLBACK2-NEXT:    andl 4(%esi), %edx
 ; X86-FALLBACK2-NEXT:    popl %esi
@@ -749,21 +753,20 @@ define i64 @clear_highbits64_c2_load(i64* %w, i64 %numhighbits) nounwind {
 ;
 ; X86-BMI2-LABEL: clear_highbits64_c2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %edi
+; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI2-NEXT:    shrxl %ebx, %eax, %esi
 ; X86-BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    cmovnel %edi, %eax
-; X86-BMI2-NEXT:    cmovel %edi, %edx
-; X86-BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI2-NEXT:    andl 4(%esi), %edx
+; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    cmovel %esi, %edx
+; X86-BMI2-NEXT:    cmovnel %esi, %eax
+; X86-BMI2-NEXT:    andl (%ecx), %eax
+; X86-BMI2-NEXT:    andl 4(%ecx), %edx
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_highbits64_c2_load:
@@ -789,22 +792,26 @@ define i64 @clear_highbits64_c2_load(i64* %w, i64 %numhighbits) nounwind {
 define i64 @clear_highbits64_c3_load_indexzext(i64* %w, i8 %numhighbits) nounwind {
 ; X86-FALLBACK0-LABEL: clear_highbits64_c3_load_indexzext:
 ; X86-FALLBACK0:       # %bb.0:
+; X86-FALLBACK0-NEXT:    pushl %edi
 ; X86-FALLBACK0-NEXT:    pushl %esi
 ; X86-FALLBACK0-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-FALLBACK0-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-FALLBACK0-NEXT:    movl $-1, %eax
-; X86-FALLBACK0-NEXT:    movl $-1, %edx
-; X86-FALLBACK0-NEXT:    shrl %cl, %edx
-; X86-FALLBACK0-NEXT:    shrdl %cl, %eax, %eax
-; X86-FALLBACK0-NEXT:    testb $32, %cl
-; X86-FALLBACK0-NEXT:    je .LBB16_2
-; X86-FALLBACK0-NEXT:  # %bb.1:
-; X86-FALLBACK0-NEXT:    movl %edx, %eax
+; X86-FALLBACK0-NEXT:    movl $-1, %edi
+; X86-FALLBACK0-NEXT:    shrl %cl, %edi
 ; X86-FALLBACK0-NEXT:    xorl %edx, %edx
-; X86-FALLBACK0-NEXT:  .LBB16_2:
+; X86-FALLBACK0-NEXT:    testb $32, %cl
+; X86-FALLBACK0-NEXT:    jne .LBB16_1
+; X86-FALLBACK0-NEXT:  # %bb.2:
+; X86-FALLBACK0-NEXT:    movl %edi, %edx
+; X86-FALLBACK0-NEXT:    jmp .LBB16_3
+; X86-FALLBACK0-NEXT:  .LBB16_1:
+; X86-FALLBACK0-NEXT:    movl %edi, %eax
+; X86-FALLBACK0-NEXT:  .LBB16_3:
 ; X86-FALLBACK0-NEXT:    andl (%esi), %eax
 ; X86-FALLBACK0-NEXT:    andl 4(%esi), %edx
 ; X86-FALLBACK0-NEXT:    popl %esi
+; X86-FALLBACK0-NEXT:    popl %edi
 ; X86-FALLBACK0-NEXT:    retl
 ;
 ; X86-FALLBACK1-LABEL: clear_highbits64_c3_load_indexzext:
@@ -813,14 +820,13 @@ define i64 @clear_highbits64_c3_load_indexzext(i64* %w, i8 %numhighbits) nounwin
 ; X86-FALLBACK1-NEXT:    pushl %esi
 ; X86-FALLBACK1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-FALLBACK1-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-FALLBACK1-NEXT:    movl $-1, %eax
 ; X86-FALLBACK1-NEXT:    movl $-1, %edi
-; X86-FALLBACK1-NEXT:    shrl %cl, %edi
-; X86-FALLBACK1-NEXT:    shrdl %cl, %eax, %eax
+; X86-FALLBACK1-NEXT:    movl $-1, %eax
+; X86-FALLBACK1-NEXT:    shrl %cl, %eax
 ; X86-FALLBACK1-NEXT:    xorl %edx, %edx
 ; X86-FALLBACK1-NEXT:    testb $32, %cl
-; X86-FALLBACK1-NEXT:    cmovnel %edi, %eax
-; X86-FALLBACK1-NEXT:    cmovel %edi, %edx
+; X86-FALLBACK1-NEXT:    cmovel %eax, %edx
+; X86-FALLBACK1-NEXT:    cmovel %edi, %eax
 ; X86-FALLBACK1-NEXT:    andl (%esi), %eax
 ; X86-FALLBACK1-NEXT:    andl 4(%esi), %edx
 ; X86-FALLBACK1-NEXT:    popl %esi
@@ -833,14 +839,13 @@ define i64 @clear_highbits64_c3_load_indexzext(i64* %w, i8 %numhighbits) nounwin
 ; X86-FALLBACK2-NEXT:    pushl %esi
 ; X86-FALLBACK2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-FALLBACK2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-FALLBACK2-NEXT:    movl $-1, %eax
 ; X86-FALLBACK2-NEXT:    movl $-1, %edi
-; X86-FALLBACK2-NEXT:    shrl %cl, %edi
-; X86-FALLBACK2-NEXT:    shrdl %cl, %eax, %eax
+; X86-FALLBACK2-NEXT:    movl $-1, %eax
+; X86-FALLBACK2-NEXT:    shrl %cl, %eax
 ; X86-FALLBACK2-NEXT:    xorl %edx, %edx
 ; X86-FALLBACK2-NEXT:    testb $32, %cl
-; X86-FALLBACK2-NEXT:    cmovnel %edi, %eax
-; X86-FALLBACK2-NEXT:    cmovel %edi, %edx
+; X86-FALLBACK2-NEXT:    cmovel %eax, %edx
+; X86-FALLBACK2-NEXT:    cmovel %edi, %eax
 ; X86-FALLBACK2-NEXT:    andl (%esi), %eax
 ; X86-FALLBACK2-NEXT:    andl 4(%esi), %edx
 ; X86-FALLBACK2-NEXT:    popl %esi
@@ -849,21 +854,20 @@ define i64 @clear_highbits64_c3_load_indexzext(i64* %w, i8 %numhighbits) nounwin
 ;
 ; X86-BMI2-LABEL: clear_highbits64_c3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %edi
+; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI2-NEXT:    shrxl %ebx, %eax, %esi
 ; X86-BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    cmovnel %edi, %eax
-; X86-BMI2-NEXT:    cmovel %edi, %edx
-; X86-BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI2-NEXT:    andl 4(%esi), %edx
+; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    cmovel %esi, %edx
+; X86-BMI2-NEXT:    cmovnel %esi, %eax
+; X86-BMI2-NEXT:    andl (%ecx), %eax
+; X86-BMI2-NEXT:    andl 4(%ecx), %edx
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_highbits64_c3_load_indexzext:
@@ -891,33 +895,36 @@ define i64 @clear_highbits64_c3_load_indexzext(i64* %w, i8 %numhighbits) nounwin
 define i64 @clear_highbits64_c4_commutative(i64 %val, i64 %numhighbits) nounwind {
 ; X86-FALLBACK0-LABEL: clear_highbits64_c4_commutative:
 ; X86-FALLBACK0:       # %bb.0:
+; X86-FALLBACK0-NEXT:    pushl %esi
 ; X86-FALLBACK0-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-FALLBACK0-NEXT:    movl $-1, %eax
-; X86-FALLBACK0-NEXT:    movl $-1, %edx
-; X86-FALLBACK0-NEXT:    shrl %cl, %edx
-; X86-FALLBACK0-NEXT:    shrdl %cl, %eax, %eax
-; X86-FALLBACK0-NEXT:    testb $32, %cl
-; X86-FALLBACK0-NEXT:    je .LBB17_2
-; X86-FALLBACK0-NEXT:  # %bb.1:
-; X86-FALLBACK0-NEXT:    movl %edx, %eax
+; X86-FALLBACK0-NEXT:    movl $-1, %esi
+; X86-FALLBACK0-NEXT:    shrl %cl, %esi
 ; X86-FALLBACK0-NEXT:    xorl %edx, %edx
-; X86-FALLBACK0-NEXT:  .LBB17_2:
+; X86-FALLBACK0-NEXT:    testb $32, %cl
+; X86-FALLBACK0-NEXT:    jne .LBB17_1
+; X86-FALLBACK0-NEXT:  # %bb.2:
+; X86-FALLBACK0-NEXT:    movl %esi, %edx
+; X86-FALLBACK0-NEXT:    jmp .LBB17_3
+; X86-FALLBACK0-NEXT:  .LBB17_1:
+; X86-FALLBACK0-NEXT:    movl %esi, %eax
+; X86-FALLBACK0-NEXT:  .LBB17_3:
 ; X86-FALLBACK0-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-FALLBACK0-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-FALLBACK0-NEXT:    popl %esi
 ; X86-FALLBACK0-NEXT:    retl
 ;
 ; X86-FALLBACK1-LABEL: clear_highbits64_c4_commutative:
 ; X86-FALLBACK1:       # %bb.0:
 ; X86-FALLBACK1-NEXT:    pushl %esi
 ; X86-FALLBACK1-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-FALLBACK1-NEXT:    movl $-1, %eax
 ; X86-FALLBACK1-NEXT:    movl $-1, %esi
-; X86-FALLBACK1-NEXT:    shrl %cl, %esi
-; X86-FALLBACK1-NEXT:    shrdl %cl, %eax, %eax
+; X86-FALLBACK1-NEXT:    movl $-1, %eax
+; X86-FALLBACK1-NEXT:    shrl %cl, %eax
 ; X86-FALLBACK1-NEXT:    xorl %edx, %edx
 ; X86-FALLBACK1-NEXT:    testb $32, %cl
-; X86-FALLBACK1-NEXT:    cmovnel %esi, %eax
-; X86-FALLBACK1-NEXT:    cmovel %esi, %edx
+; X86-FALLBACK1-NEXT:    cmovel %eax, %edx
+; X86-FALLBACK1-NEXT:    cmovel %esi, %eax
 ; X86-FALLBACK1-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-FALLBACK1-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-FALLBACK1-NEXT:    popl %esi
@@ -927,14 +934,13 @@ define i64 @clear_highbits64_c4_commutative(i64 %val, i64 %numhighbits) nounwind
 ; X86-FALLBACK2:       # %bb.0:
 ; X86-FALLBACK2-NEXT:    pushl %esi
 ; X86-FALLBACK2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-FALLBACK2-NEXT:    movl $-1, %eax
 ; X86-FALLBACK2-NEXT:    movl $-1, %esi
-; X86-FALLBACK2-NEXT:    shrl %cl, %esi
-; X86-FALLBACK2-NEXT:    shrdl %cl, %eax, %eax
+; X86-FALLBACK2-NEXT:    movl $-1, %eax
+; X86-FALLBACK2-NEXT:    shrl %cl, %eax
 ; X86-FALLBACK2-NEXT:    xorl %edx, %edx
 ; X86-FALLBACK2-NEXT:    testb $32, %cl
-; X86-FALLBACK2-NEXT:    cmovnel %esi, %eax
-; X86-FALLBACK2-NEXT:    cmovel %esi, %edx
+; X86-FALLBACK2-NEXT:    cmovel %eax, %edx
+; X86-FALLBACK2-NEXT:    cmovel %esi, %eax
 ; X86-FALLBACK2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-FALLBACK2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-FALLBACK2-NEXT:    popl %esi
@@ -946,11 +952,10 @@ define i64 @clear_highbits64_c4_commutative(i64 %val, i64 %numhighbits) nounwind
 ; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %eax
 ; X86-BMI2-NEXT:    shrxl %ecx, %eax, %esi
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    cmovnel %esi, %eax
 ; X86-BMI2-NEXT:    cmovel %esi, %edx
+; X86-BMI2-NEXT:    cmovnel %esi, %eax
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    popl %esi
@@ -1064,7 +1069,6 @@ define i64 @oneuse64(i64 %val, i64 %numhighbits) nounwind {
 ; X86-FALLBACK0-NEXT:    movl $-1, %esi
 ; X86-FALLBACK0-NEXT:    movl $-1, %edi
 ; X86-FALLBACK0-NEXT:    shrl %cl, %edi
-; X86-FALLBACK0-NEXT:    shrdl %cl, %esi, %esi
 ; X86-FALLBACK0-NEXT:    testb $32, %cl
 ; X86-FALLBACK0-NEXT:    je .LBB19_2
 ; X86-FALLBACK0-NEXT:  # %bb.1:
@@ -1094,7 +1098,6 @@ define i64 @oneuse64(i64 %val, i64 %numhighbits) nounwind {
 ; X86-FALLBACK1-NEXT:    movl $-1, %esi
 ; X86-FALLBACK1-NEXT:    movl $-1, %eax
 ; X86-FALLBACK1-NEXT:    shrl %cl, %eax
-; X86-FALLBACK1-NEXT:    shrdl %cl, %esi, %esi
 ; X86-FALLBACK1-NEXT:    xorl %edi, %edi
 ; X86-FALLBACK1-NEXT:    testb $32, %cl
 ; X86-FALLBACK1-NEXT:    cmovnel %eax, %esi
@@ -1122,7 +1125,6 @@ define i64 @oneuse64(i64 %val, i64 %numhighbits) nounwind {
 ; X86-FALLBACK2-NEXT:    movl $-1, %esi
 ; X86-FALLBACK2-NEXT:    movl $-1, %eax
 ; X86-FALLBACK2-NEXT:    shrl %cl, %eax
-; X86-FALLBACK2-NEXT:    shrdl %cl, %esi, %esi
 ; X86-FALLBACK2-NEXT:    xorl %edi, %edi
 ; X86-FALLBACK2-NEXT:    testb $32, %cl
 ; X86-FALLBACK2-NEXT:    cmovnel %eax, %esi
@@ -1146,14 +1148,13 @@ define i64 @oneuse64(i64 %val, i64 %numhighbits) nounwind {
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    pushl %eax
-; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %eax
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %esi
+; X86-BMI2-NEXT:    shrxl %eax, %esi, %ecx
 ; X86-BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    cmovnel %eax, %esi
-; X86-BMI2-NEXT:    cmovel %eax, %edi
+; X86-BMI2-NEXT:    testb $32, %al
+; X86-BMI2-NEXT:    cmovnel %ecx, %esi
+; X86-BMI2-NEXT:    cmovel %ecx, %edi
 ; X86-BMI2-NEXT:    subl $8, %esp
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi

diff --git a/llvm/test/CodeGen/X86/clear-lowbits.ll b/llvm/test/CodeGen/X86/clear-lowbits.ll
index be251eb99f2e..a8f2ecde03d9 100644
--- a/llvm/test/CodeGen/X86/clear-lowbits.ll
+++ b/llvm/test/CodeGen/X86/clear-lowbits.ll
@@ -502,15 +502,14 @@ define i64 @clear_lowbits64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
-; X86-NOBMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI2-NEXT:    testb $32, %cl
 ; X86-NOBMI2-NEXT:    je .LBB13_2
 ; X86-NOBMI2-NEXT:  # %bb.1:
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB13_2:
-; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_c0:
@@ -518,15 +517,14 @@ define i64 @clear_lowbits64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB13_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB13_2:
-; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_c0:
@@ -555,15 +553,14 @@ define i64 @clear_lowbits64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
-; X86-NOBMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI2-NEXT:    testb $32, %cl
 ; X86-NOBMI2-NEXT:    je .LBB14_2
 ; X86-NOBMI2-NEXT:  # %bb.1:
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB14_2:
-; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_c1_indexzext:
@@ -571,15 +568,14 @@ define i64 @clear_lowbits64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB14_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB14_2:
-; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_c1_indexzext:
@@ -612,35 +608,33 @@ define i64 @clear_lowbits64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
-; X86-NOBMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI2-NEXT:    testb $32, %cl
 ; X86-NOBMI2-NEXT:    je .LBB15_2
 ; X86-NOBMI2-NEXT:  # %bb.1:
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB15_2:
-; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI2-NEXT:    andl (%esi), %eax
+; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI2-NEXT:    popl %esi
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_c2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    pushl %ebx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
-; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    shlxl %ebx, %edx, %eax
+; X86-BMI2-NEXT:    testb $32, %bl
 ; X86-BMI2-NEXT:    je .LBB15_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB15_2:
-; X86-BMI2-NEXT:    andl 4(%esi), %edx
-; X86-BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    andl (%ecx), %eax
+; X86-BMI2-NEXT:    andl 4(%ecx), %edx
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_c2_load:
@@ -672,35 +666,33 @@ define i64 @clear_lowbits64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
-; X86-NOBMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI2-NEXT:    testb $32, %cl
 ; X86-NOBMI2-NEXT:    je .LBB16_2
 ; X86-NOBMI2-NEXT:  # %bb.1:
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB16_2:
-; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI2-NEXT:    andl (%esi), %eax
+; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI2-NEXT:    popl %esi
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_c3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    pushl %ebx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
-; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    shlxl %ebx, %edx, %eax
+; X86-BMI2-NEXT:    testb $32, %bl
 ; X86-BMI2-NEXT:    je .LBB16_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB16_2:
-; X86-BMI2-NEXT:    andl 4(%esi), %edx
-; X86-BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    andl (%ecx), %eax
+; X86-BMI2-NEXT:    andl 4(%ecx), %edx
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_c3_load_indexzext:
@@ -732,15 +724,14 @@ define i64 @clear_lowbits64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
-; X86-NOBMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI2-NEXT:    testb $32, %cl
 ; X86-NOBMI2-NEXT:    je .LBB17_2
 ; X86-NOBMI2-NEXT:  # %bb.1:
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB17_2:
-; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_c4_commutative:
@@ -748,15 +739,14 @@ define i64 @clear_lowbits64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB17_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB17_2:
-; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_c4_commutative:
@@ -1325,15 +1315,14 @@ define i64 @clear_lowbits64_ic0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
-; X86-NOBMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI2-NEXT:    testb $32, %cl
 ; X86-NOBMI2-NEXT:    je .LBB31_2
 ; X86-NOBMI2-NEXT:  # %bb.1:
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB31_2:
-; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_ic0:
@@ -1342,15 +1331,14 @@ define i64 @clear_lowbits64_ic0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB31_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB31_2:
-; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_ic0:
@@ -1383,15 +1371,14 @@ define i64 @clear_lowbits64_ic1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
-; X86-NOBMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI2-NEXT:    testb $32, %cl
 ; X86-NOBMI2-NEXT:    je .LBB32_2
 ; X86-NOBMI2-NEXT:  # %bb.1:
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB32_2:
-; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_ic1_indexzext:
@@ -1400,15 +1387,14 @@ define i64 @clear_lowbits64_ic1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB32_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB32_2:
-; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_ic1_indexzext:
@@ -1445,36 +1431,34 @@ define i64 @clear_lowbits64_ic2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
-; X86-NOBMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI2-NEXT:    testb $32, %cl
 ; X86-NOBMI2-NEXT:    je .LBB33_2
 ; X86-NOBMI2-NEXT:  # %bb.1:
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB33_2:
-; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI2-NEXT:    andl (%esi), %eax
+; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI2-NEXT:    popl %esi
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_ic2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movb $64, %cl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    pushl %ebx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movb $64, %bl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %bl
 ; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
-; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    shlxl %ebx, %edx, %eax
+; X86-BMI2-NEXT:    testb $32, %bl
 ; X86-BMI2-NEXT:    je .LBB33_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB33_2:
-; X86-BMI2-NEXT:    andl 4(%esi), %edx
-; X86-BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    andl (%ecx), %eax
+; X86-BMI2-NEXT:    andl 4(%ecx), %edx
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_ic2_load:
@@ -1510,36 +1494,34 @@ define i64 @clear_lowbits64_ic3_load_indexzext(i64* %w, i8 %numlowbits) nounwind
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
-; X86-NOBMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI2-NEXT:    testb $32, %cl
 ; X86-NOBMI2-NEXT:    je .LBB34_2
 ; X86-NOBMI2-NEXT:  # %bb.1:
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB34_2:
-; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI2-NEXT:    andl (%esi), %eax
+; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI2-NEXT:    popl %esi
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_ic3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movb $64, %cl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    pushl %ebx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movb $64, %bl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %bl
 ; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
-; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    shlxl %ebx, %edx, %eax
+; X86-BMI2-NEXT:    testb $32, %bl
 ; X86-BMI2-NEXT:    je .LBB34_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB34_2:
-; X86-BMI2-NEXT:    andl 4(%esi), %edx
-; X86-BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    andl (%ecx), %eax
+; X86-BMI2-NEXT:    andl 4(%ecx), %edx
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_ic3_load_indexzext:
@@ -1575,15 +1557,14 @@ define i64 @clear_lowbits64_ic4_commutative(i64 %val, i64 %numlowbits) nounwind
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
-; X86-NOBMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI2-NEXT:    testb $32, %cl
 ; X86-NOBMI2-NEXT:    je .LBB35_2
 ; X86-NOBMI2-NEXT:  # %bb.1:
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB35_2:
-; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_ic4_commutative:
@@ -1592,15 +1573,14 @@ define i64 @clear_lowbits64_ic4_commutative(i64 %val, i64 %numlowbits) nounwind
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-BMI2-NEXT:    shldl %cl, %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB35_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB35_2:
-; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_ic4_commutative:
@@ -1712,22 +1692,24 @@ define i64 @oneuse64(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    pushl %eax
 ; X86-NOBMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    movl $-1, %esi
-; X86-NOBMI2-NEXT:    movl $-1, %edi
-; X86-NOBMI2-NEXT:    shll %cl, %edi
-; X86-NOBMI2-NEXT:    shldl %cl, %esi, %esi
-; X86-NOBMI2-NEXT:    testb $32, %cl
-; X86-NOBMI2-NEXT:    je .LBB37_2
-; X86-NOBMI2-NEXT:  # %bb.1:
-; X86-NOBMI2-NEXT:    movl %edi, %esi
+; X86-NOBMI2-NEXT:    movl $-1, %eax
+; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    xorl %edi, %edi
-; X86-NOBMI2-NEXT:  .LBB37_2:
+; X86-NOBMI2-NEXT:    testb $32, %cl
+; X86-NOBMI2-NEXT:    jne .LBB37_1
+; X86-NOBMI2-NEXT:  # %bb.2:
+; X86-NOBMI2-NEXT:    movl %eax, %edi
+; X86-NOBMI2-NEXT:    jmp .LBB37_3
+; X86-NOBMI2-NEXT:  .LBB37_1:
+; X86-NOBMI2-NEXT:    movl %eax, %esi
+; X86-NOBMI2-NEXT:  .LBB37_3:
 ; X86-NOBMI2-NEXT:    subl $8, %esp
 ; X86-NOBMI2-NEXT:    pushl %esi
 ; X86-NOBMI2-NEXT:    pushl %edi
 ; X86-NOBMI2-NEXT:    calll use64
 ; X86-NOBMI2-NEXT:    addl $16, %esp
-; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movl %edi, %eax
 ; X86-NOBMI2-NEXT:    movl %esi, %edx
 ; X86-NOBMI2-NEXT:    addl $4, %esp
@@ -1742,21 +1724,23 @@ define i64 @oneuse64(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI2-NEXT:    pushl %eax
 ; X86-BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    shlxl %ecx, %esi, %edi
-; X86-BMI2-NEXT:    shldl %cl, %esi, %esi
-; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB37_2
-; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edi, %esi
+; X86-BMI2-NEXT:    shlxl %ecx, %esi, %eax
 ; X86-BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI2-NEXT:  .LBB37_2:
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    jne .LBB37_1
+; X86-BMI2-NEXT:  # %bb.2:
+; X86-BMI2-NEXT:    movl %eax, %edi
+; X86-BMI2-NEXT:    jmp .LBB37_3
+; X86-BMI2-NEXT:  .LBB37_1:
+; X86-BMI2-NEXT:    movl %eax, %esi
+; X86-BMI2-NEXT:  .LBB37_3:
 ; X86-BMI2-NEXT:    subl $8, %esp
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    calll use64
 ; X86-BMI2-NEXT:    addl $16, %esp
-; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
 ; X86-BMI2-NEXT:    movl %edi, %eax
 ; X86-BMI2-NEXT:    movl %esi, %edx
 ; X86-BMI2-NEXT:    addl $4, %esp

diff  --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll
index edb603f87d0e..9825b828eddf 100644
--- a/llvm/test/CodeGen/X86/extract-bits.ll
+++ b/llvm/test/CodeGen/X86/extract-bits.ll
@@ -2661,6 +2661,7 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_b0:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
@@ -2670,6 +2671,7 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB25_2
 ; X86-NOBMI-NEXT:  # %bb.1:
@@ -2677,22 +2679,24 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB25_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
+; X86-NOBMI-NEXT:    shll %cl, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB25_4
-; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB25_4:
+; X86-NOBMI-NEXT:    jne .LBB25_3
+; X86-NOBMI-NEXT:  # %bb.4:
+; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    jmp .LBB25_5
+; X86-NOBMI-NEXT:  .LBB25_3:
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:  .LBB25_5:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_b0:
@@ -2717,7 +2721,6 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
 ; X86-BMI1NOTBM-NEXT:    je .LBB25_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -2734,34 +2737,30 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-LABEL: bextr64_b0:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
-; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %esi
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB25_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %esi
+; X86-BMI1BMI2-NEXT:    movl %edx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:  .LBB25_2:
-; X86-BMI1BMI2-NEXT:    movl $-1, %edi
-; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
-; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
-; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
-; X86-BMI1BMI2-NEXT:    testb $32, %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shlxl %ebx, %esi, %ecx
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
 ; X86-BMI1BMI2-NEXT:    je .LBB25_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
-; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI1BMI2-NEXT:    movl %ecx, %esi
+; X86-BMI1BMI2-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1BMI2-NEXT:  .LBB25_4:
-; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
-; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
+; X86-BMI1BMI2-NEXT:    andnl %edx, %esi, %edx
+; X86-BMI1BMI2-NEXT:    andnl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
-; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
@@ -2800,6 +2799,7 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_b1_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
@@ -2809,6 +2809,7 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB26_2
 ; X86-NOBMI-NEXT:  # %bb.1:
@@ -2816,22 +2817,24 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB26_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
+; X86-NOBMI-NEXT:    shll %cl, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB26_4
-; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB26_4:
+; X86-NOBMI-NEXT:    jne .LBB26_3
+; X86-NOBMI-NEXT:  # %bb.4:
+; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    jmp .LBB26_5
+; X86-NOBMI-NEXT:  .LBB26_3:
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:  .LBB26_5:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_b1_indexzext:
@@ -2856,7 +2859,6 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
 ; X86-BMI1NOTBM-NEXT:    je .LBB26_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -2873,34 +2875,30 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1BMI2-LABEL: bextr64_b1_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
-; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %esi
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB26_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %esi
+; X86-BMI1BMI2-NEXT:    movl %edx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:  .LBB26_2:
-; X86-BMI1BMI2-NEXT:    movl $-1, %edi
-; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
-; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
-; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
-; X86-BMI1BMI2-NEXT:    testb $32, %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shlxl %ebx, %esi, %ecx
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
 ; X86-BMI1BMI2-NEXT:    je .LBB26_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
-; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI1BMI2-NEXT:    movl %ecx, %esi
+; X86-BMI1BMI2-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1BMI2-NEXT:  .LBB26_4:
-; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
-; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
+; X86-BMI1BMI2-NEXT:    andnl %edx, %esi, %edx
+; X86-BMI1BMI2-NEXT:    andnl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
-; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
@@ -2943,6 +2941,7 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_b2_load:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
@@ -2953,6 +2952,7 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB27_2
 ; X86-NOBMI-NEXT:  # %bb.1:
@@ -2960,22 +2960,24 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB27_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
+; X86-NOBMI-NEXT:    shll %cl, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB27_4
-; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB27_4:
+; X86-NOBMI-NEXT:    jne .LBB27_3
+; X86-NOBMI-NEXT:  # %bb.4:
+; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    jmp .LBB27_5
+; X86-NOBMI-NEXT:  .LBB27_3:
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:  .LBB27_5:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_b2_load:
@@ -3001,7 +3003,6 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
 ; X86-BMI1NOTBM-NEXT:    je .LBB27_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -3018,35 +3019,31 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-LABEL: bextr64_b2_load:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
-; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1BMI2-NEXT:    movl (%edx), %esi
-; X86-BMI1BMI2-NEXT:    movl 4(%edx), %edi
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edi, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1BMI2-NEXT:    movl (%edx), %eax
+; X86-BMI1BMI2-NEXT:    movl 4(%edx), %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB27_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %esi
+; X86-BMI1BMI2-NEXT:    movl %edx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:  .LBB27_2:
-; X86-BMI1BMI2-NEXT:    movl $-1, %edi
-; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
-; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
-; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
-; X86-BMI1BMI2-NEXT:    testb $32, %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shlxl %ebx, %esi, %ecx
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
 ; X86-BMI1BMI2-NEXT:    je .LBB27_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
-; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI1BMI2-NEXT:    movl %ecx, %esi
+; X86-BMI1BMI2-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1BMI2-NEXT:  .LBB27_4:
-; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
-; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
+; X86-BMI1BMI2-NEXT:    andnl %edx, %esi, %edx
+; X86-BMI1BMI2-NEXT:    andnl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
-; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
@@ -3087,6 +3084,7 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_b3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
@@ -3097,6 +3095,7 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB28_2
 ; X86-NOBMI-NEXT:  # %bb.1:
@@ -3104,22 +3103,24 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB28_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
+; X86-NOBMI-NEXT:    shll %cl, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB28_4
-; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB28_4:
+; X86-NOBMI-NEXT:    jne .LBB28_3
+; X86-NOBMI-NEXT:  # %bb.4:
+; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    jmp .LBB28_5
+; X86-NOBMI-NEXT:  .LBB28_3:
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:  .LBB28_5:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl %esi, %eax
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_b3_load_indexzext:
@@ -3145,7 +3146,6 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
 ; X86-BMI1NOTBM-NEXT:    je .LBB28_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -3162,35 +3162,31 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-BMI1BMI2-LABEL: bextr64_b3_load_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
-; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1BMI2-NEXT:    movl (%edx), %esi
-; X86-BMI1BMI2-NEXT:    movl 4(%edx), %edi
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edi, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1BMI2-NEXT:    movl (%edx), %eax
+; X86-BMI1BMI2-NEXT:    movl 4(%edx), %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB28_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %esi
+; X86-BMI1BMI2-NEXT:    movl %edx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:  .LBB28_2:
-; X86-BMI1BMI2-NEXT:    movl $-1, %edi
-; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
-; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
-; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
-; X86-BMI1BMI2-NEXT:    testb $32, %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shlxl %ebx, %esi, %ecx
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
 ; X86-BMI1BMI2-NEXT:    je .LBB28_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
-; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI1BMI2-NEXT:    movl %ecx, %esi
+; X86-BMI1BMI2-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1BMI2-NEXT:  .LBB28_4:
-; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
-; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
+; X86-BMI1BMI2-NEXT:    andnl %edx, %esi, %edx
+; X86-BMI1BMI2-NEXT:    andnl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
-; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
@@ -3235,6 +3231,7 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_b4_commutative:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
@@ -3244,6 +3241,7 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    movl %esi, %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %edx
 ; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
+; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB29_2
 ; X86-NOBMI-NEXT:  # %bb.1:
@@ -3251,22 +3249,24 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:  .LBB29_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    shldl %cl, %edi, %edi
+; X86-NOBMI-NEXT:    shll %cl, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB29_4
-; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %esi, %edi
-; X86-NOBMI-NEXT:    xorl %esi, %esi
-; X86-NOBMI-NEXT:  .LBB29_4:
+; X86-NOBMI-NEXT:    jne .LBB29_3
+; X86-NOBMI-NEXT:  # %bb.4:
+; X86-NOBMI-NEXT:    movl %ebx, %esi
+; X86-NOBMI-NEXT:    jmp .LBB29_5
+; X86-NOBMI-NEXT:  .LBB29_3:
+; X86-NOBMI-NEXT:    movl %ebx, %edi
+; X86-NOBMI-NEXT:  .LBB29_5:
 ; X86-NOBMI-NEXT:    notl %edi
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %esi
 ; X86-NOBMI-NEXT:    andl %esi, %eax
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_b4_commutative:
@@ -3291,7 +3291,6 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
 ; X86-BMI1NOTBM-NEXT:    je .LBB29_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -3308,34 +3307,30 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1BMI2-LABEL: bextr64_b4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
-; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %esi
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB29_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %esi
+; X86-BMI1BMI2-NEXT:    movl %edx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:  .LBB29_2:
-; X86-BMI1BMI2-NEXT:    movl $-1, %edi
-; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
-; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
-; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
-; X86-BMI1BMI2-NEXT:    testb $32, %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shlxl %ebx, %esi, %ecx
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
 ; X86-BMI1BMI2-NEXT:    je .LBB29_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
-; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI1BMI2-NEXT:    movl %ecx, %esi
+; X86-BMI1BMI2-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1BMI2-NEXT:  .LBB29_4:
-; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
-; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
+; X86-BMI1BMI2-NEXT:    andnl %edx, %esi, %edx
+; X86-BMI1BMI2-NEXT:    andnl %eax, %ecx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
-; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
@@ -3379,42 +3374,44 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl %esi, %ebp
-; X86-NOBMI-NEXT:    movl %eax, %ecx
+; X86-NOBMI-NEXT:    movb %al, %cl
 ; X86-NOBMI-NEXT:    shrl %cl, %ebp
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %edx
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %al
 ; X86-NOBMI-NEXT:    je .LBB30_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %ebp, %ebx
+; X86-NOBMI-NEXT:    movl %ebp, %edx
 ; X86-NOBMI-NEXT:    xorl %ebp, %ebp
 ; X86-NOBMI-NEXT:  .LBB30_2:
-; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
-; X86-NOBMI-NEXT:    shll %cl, %edi
-; X86-NOBMI-NEXT:    shldl %cl, %esi, %esi
-; X86-NOBMI-NEXT:    testb $32, %dl
-; X86-NOBMI-NEXT:    je .LBB30_4
-; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB30_4:
-; X86-NOBMI-NEXT:    notl %esi
-; X86-NOBMI-NEXT:    andl %ebp, %esi
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movb %ch, %cl
+; X86-NOBMI-NEXT:    shll %cl, %esi
+; X86-NOBMI-NEXT:    testb $32, %ch
+; X86-NOBMI-NEXT:    jne .LBB30_3
+; X86-NOBMI-NEXT:  # %bb.4:
+; X86-NOBMI-NEXT:    movl %esi, %ebx
+; X86-NOBMI-NEXT:    jmp .LBB30_5
+; X86-NOBMI-NEXT:  .LBB30_3:
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:  .LBB30_5:
 ; X86-NOBMI-NEXT:    notl %edi
-; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    andl %ebp, %edi
+; X86-NOBMI-NEXT:    notl %ebx
+; X86-NOBMI-NEXT:    andl %edx, %ebx
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    calll use64
 ; X86-NOBMI-NEXT:    addl $16, %esp
-; X86-NOBMI-NEXT:    movl %edi, %eax
-; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
 ; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
@@ -3447,7 +3444,6 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebp
-; X86-BMI1NOTBM-NEXT:    shldl %cl, %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %dl
 ; X86-BMI1NOTBM-NEXT:    je .LBB30_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -3477,34 +3473,32 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    subl $12, %esp
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edi
-; X86-BMI1BMI2-NEXT:    shrxl %eax, %esi, %esi
-; X86-BMI1BMI2-NEXT:    testb $32, %al
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
+; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB30_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %esi, %edi
-; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
+; X86-BMI1BMI2-NEXT:    movl %edx, %eax
+; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:  .LBB30_2:
-; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
-; X86-BMI1BMI2-NEXT:    shlxl %edx, %ebp, %ebx
-; X86-BMI1BMI2-NEXT:    movl %edx, %ecx
-; X86-BMI1BMI2-NEXT:    shldl %cl, %ebp, %ebp
-; X86-BMI1BMI2-NEXT:    testb $32, %dl
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shlxl %ebx, %esi, %edi
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
 ; X86-BMI1BMI2-NEXT:    je .LBB30_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
-; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB30_4:
-; X86-BMI1BMI2-NEXT:    andnl %esi, %ebp, %esi
-; X86-BMI1BMI2-NEXT:    andnl %edi, %ebx, %edi
+; X86-BMI1BMI2-NEXT:    andnl %edx, %esi, %esi
+; X86-BMI1BMI2-NEXT:    andnl %eax, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ecx
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
 ; X86-BMI1BMI2-NEXT:    movl %edi, %eax
@@ -4888,7 +4882,6 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
-; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB41_4
 ; X86-NOBMI-NEXT:  # %bb.3:
@@ -4935,7 +4928,6 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB41_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -4976,24 +4968,23 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB41_2:
-; X86-BMI1BMI2-NEXT:    movb $64, %cl
-; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movb $64, %al
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    testb $32, %al
 ; X86-BMI1BMI2-NEXT:    je .LBB41_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
-; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
+; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:  .LBB41_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
-; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
-; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    movl %edi, %edx
 ; X86-BMI1BMI2-NEXT:    addl $12, %esp
@@ -5097,7 +5088,6 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
-; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB42_4
 ; X86-NOBMI-NEXT:  # %bb.3:
@@ -5144,7 +5134,6 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB42_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -5185,24 +5174,23 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB42_2:
-; X86-BMI1BMI2-NEXT:    movb $64, %cl
-; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movb $64, %al
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    testb $32, %al
 ; X86-BMI1BMI2-NEXT:    je .LBB42_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
-; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
+; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:  .LBB42_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
-; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
-; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    movl %edi, %edx
 ; X86-BMI1BMI2-NEXT:    addl $12, %esp
@@ -5310,7 +5298,6 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
-; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB43_4
 ; X86-NOBMI-NEXT:  # %bb.3:
@@ -5358,7 +5345,6 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB43_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -5400,24 +5386,23 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB43_2:
-; X86-BMI1BMI2-NEXT:    movb $64, %cl
-; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movb $64, %al
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    testb $32, %al
 ; X86-BMI1BMI2-NEXT:    je .LBB43_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
-; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
+; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:  .LBB43_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
-; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
-; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    movl %edi, %edx
 ; X86-BMI1BMI2-NEXT:    addl $12, %esp
@@ -5523,7 +5508,6 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
-; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB44_4
 ; X86-NOBMI-NEXT:  # %bb.3:
@@ -5571,7 +5555,6 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB44_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -5613,24 +5596,23 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB44_2:
-; X86-BMI1BMI2-NEXT:    movb $64, %cl
-; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movb $64, %al
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    testb $32, %al
 ; X86-BMI1BMI2-NEXT:    je .LBB44_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
-; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
+; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:  .LBB44_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
-; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
-; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    movl %edi, %edx
 ; X86-BMI1BMI2-NEXT:    addl $12, %esp
@@ -5738,7 +5720,6 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
-; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB45_4
 ; X86-NOBMI-NEXT:  # %bb.3:
@@ -5785,7 +5766,6 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB45_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -5826,24 +5806,23 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB45_2:
-; X86-BMI1BMI2-NEXT:    movb $64, %cl
-; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movb $64, %al
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    testb $32, %al
 ; X86-BMI1BMI2-NEXT:    je .LBB45_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
-; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
+; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:  .LBB45_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %ebp
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
-; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
-; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    movl %edi, %edx
 ; X86-BMI1BMI2-NEXT:    addl $12, %esp
@@ -5947,7 +5926,6 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    movl $-1, %ebp
 ; X86-NOBMI-NEXT:    shrl %cl, %ebp
-; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB46_4
 ; X86-NOBMI-NEXT:  # %bb.3:
@@ -5999,7 +5977,6 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebp
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB46_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
@@ -6045,12 +6022,11 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB46_2:
-; X86-BMI1BMI2-NEXT:    movb $64, %cl
-; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    movb $64, %al
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebp, %ebx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebp, %ebp
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    testb $32, %al
 ; X86-BMI1BMI2-NEXT:    je .LBB46_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
@@ -6175,14 +6151,12 @@ define i32 @bextr64_32_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:  .LBB47_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB47_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:  .LBB47_4:
 ; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    popl %esi
@@ -6204,14 +6178,12 @@ define i32 @bextr64_32_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:  .LBB47_2:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    jne .LBB47_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:  .LBB47_4:
 ; X86-BMI1NOTBM-NEXT:    andl %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    popl %esi
@@ -6219,7 +6191,6 @@ define i32 @bextr64_32_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind
 ;
 ; X86-BMI1BMI2-LABEL: bextr64_32_c0:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -6231,16 +6202,13 @@ define i32 @bextr64_32_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:  .LBB47_2:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %esi
 ; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB47_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %eax
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-BMI1BMI2-NEXT:  .LBB47_4:
 ; X86-BMI1BMI2-NEXT:    andl %edx, %eax
-; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_c0:

diff  --git a/llvm/test/CodeGen/X86/extract-lowbits.ll b/llvm/test/CodeGen/X86/extract-lowbits.ll
index c7fa8617e072..e93638132e7d 100644
--- a/llvm/test/CodeGen/X86/extract-lowbits.ll
+++ b/llvm/test/CodeGen/X86/extract-lowbits.ll
@@ -1356,58 +1356,56 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
 define i64 @bzhi64_b0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_b0:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    shll %cl, %eax
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
-; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB20_2
-; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB20_2:
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    jne .LBB20_1
+; X86-NOBMI-NEXT:  # %bb.2:
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    jmp .LBB20_3
+; X86-NOBMI-NEXT:  .LBB20_1:
+; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:  .LBB20_3:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_b0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB20_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
+; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
 ; X86-BMI1NOTBM-NEXT:  .LBB20_2:
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %edx
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_b0:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %esi
-; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %eax
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shlxl %edx, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    testb $32, %dl
 ; X86-BMI1BMI2-NEXT:    je .LBB20_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
+; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
+; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI1BMI2-NEXT:  .LBB20_2:
-; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %edx
-; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %eax
-; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %edx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_b0:
@@ -1439,58 +1437,56 @@ define i64 @bzhi64_b0(i64 %val, i64 %numlowbits) nounwind {
 define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_b1_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    shll %cl, %eax
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
-; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB21_2
-; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB21_2:
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    jne .LBB21_1
+; X86-NOBMI-NEXT:  # %bb.2:
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    jmp .LBB21_3
+; X86-NOBMI-NEXT:  .LBB21_1:
+; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:  .LBB21_3:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_b1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB21_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
+; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
 ; X86-BMI1NOTBM-NEXT:  .LBB21_2:
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %edx
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_b1_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %esi
-; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %eax
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shlxl %edx, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    testb $32, %dl
 ; X86-BMI1BMI2-NEXT:    je .LBB21_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
+; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
+; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI1BMI2-NEXT:  .LBB21_2:
-; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %edx
-; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %eax
-; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %edx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_b1_indexzext:
@@ -1525,63 +1521,65 @@ define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 define i64 @bzhi64_b2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_b2_load:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    shll %cl, %eax
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
-; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB22_2
-; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    movl $-1, %edi
+; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB22_2:
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    jne .LBB22_1
+; X86-NOBMI-NEXT:  # %bb.2:
+; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    jmp .LBB22_3
+; X86-NOBMI-NEXT:  .LBB22_1:
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:  .LBB22_3:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI-NEXT:    andl (%esi), %eax
+; X86-NOBMI-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_b2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    shldl %cl, %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB22_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %esi, %edx
-; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %eax, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
 ; X86-BMI1NOTBM-NEXT:  .LBB22_2:
-; X86-BMI1NOTBM-NEXT:    andnl 4(%eax), %edx, %edx
-; X86-BMI1NOTBM-NEXT:    andnl (%eax), %esi, %eax
+; X86-BMI1NOTBM-NEXT:    andnl (%edx), %eax, %eax
+; X86-BMI1NOTBM-NEXT:    andnl 4(%edx), %esi, %edx
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_b2_load:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edx
-; X86-BMI1BMI2-NEXT:    shlxl %ecx, %edx, %esi
-; X86-BMI1BMI2-NEXT:    shldl %cl, %edx, %edx
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    shlxl %ebx, %edx, %eax
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
 ; X86-BMI1BMI2-NEXT:    je .LBB22_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %esi, %edx
-; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
+; X86-BMI1BMI2-NEXT:    movl %eax, %edx
+; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI1BMI2-NEXT:  .LBB22_2:
-; X86-BMI1BMI2-NEXT:    andnl 4(%eax), %edx, %edx
-; X86-BMI1BMI2-NEXT:    andnl (%eax), %esi, %eax
-; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    andnl (%ecx), %eax, %eax
+; X86-BMI1BMI2-NEXT:    andnl 4(%ecx), %edx, %edx
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_b2_load:
@@ -1614,63 +1612,65 @@ define i64 @bzhi64_b2_load(i64* %w, i64 %numlowbits) nounwind {
 define i64 @bzhi64_b3_load_indexzext(i64* %w, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_b3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    shll %cl, %eax
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
-; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB23_2
-; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    movl $-1, %edi
+; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB23_2:
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    jne .LBB23_1
+; X86-NOBMI-NEXT:  # %bb.2:
+; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    jmp .LBB23_3
+; X86-NOBMI-NEXT:  .LBB23_1:
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:  .LBB23_3:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI-NEXT:    andl (%esi), %eax
+; X86-NOBMI-NEXT:    andl 4(%esi), %edx
 ; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_b3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    shldl %cl, %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB23_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %esi, %edx
-; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %eax, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
 ; X86-BMI1NOTBM-NEXT:  .LBB23_2:
-; X86-BMI1NOTBM-NEXT:    andnl 4(%eax), %edx, %edx
-; X86-BMI1NOTBM-NEXT:    andnl (%eax), %esi, %eax
+; X86-BMI1NOTBM-NEXT:    andnl (%edx), %eax, %eax
+; X86-BMI1NOTBM-NEXT:    andnl 4(%edx), %esi, %edx
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_b3_load_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edx
-; X86-BMI1BMI2-NEXT:    shlxl %ecx, %edx, %esi
-; X86-BMI1BMI2-NEXT:    shldl %cl, %edx, %edx
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    shlxl %ebx, %edx, %eax
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
 ; X86-BMI1BMI2-NEXT:    je .LBB23_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %esi, %edx
-; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
+; X86-BMI1BMI2-NEXT:    movl %eax, %edx
+; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI1BMI2-NEXT:  .LBB23_2:
-; X86-BMI1BMI2-NEXT:    andnl 4(%eax), %edx, %edx
-; X86-BMI1BMI2-NEXT:    andnl (%eax), %esi, %eax
-; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    andnl (%ecx), %eax, %eax
+; X86-BMI1BMI2-NEXT:    andnl 4(%ecx), %edx, %edx
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_b3_load_indexzext:
@@ -1706,58 +1706,56 @@ define i64 @bzhi64_b3_load_indexzext(i64* %w, i8 zeroext %numlowbits) nounwind {
 define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_b4_commutative:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    shll %cl, %eax
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
-; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB24_2
-; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB24_2:
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    jne .LBB24_1
+; X86-NOBMI-NEXT:  # %bb.2:
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    jmp .LBB24_3
+; X86-NOBMI-NEXT:  .LBB24_1:
+; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:  .LBB24_3:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_b4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB24_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
+; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
 ; X86-BMI1NOTBM-NEXT:  .LBB24_2:
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %edx
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_b4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %esi
-; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %eax
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shlxl %edx, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    testb $32, %dl
 ; X86-BMI1BMI2-NEXT:    je .LBB24_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
+; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
+; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI1BMI2-NEXT:  .LBB24_2:
-; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %edx
-; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %eax
-; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1BMI2-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %edx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_b4_commutative:
@@ -2628,7 +2626,6 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB34_2
 ; X86-NOBMI-NEXT:  # %bb.1:
@@ -2659,7 +2656,6 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB34_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
@@ -2685,26 +2681,25 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    pushl %eax
-; X86-BMI1BMI2-NEXT:    movb $64, %cl
-; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %esi
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movb $64, %al
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %edi
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %edi, %esi
+; X86-BMI1BMI2-NEXT:    testb $32, %al
 ; X86-BMI1BMI2-NEXT:    je .LBB34_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edi, %esi
-; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %edi
+; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT:  .LBB34_2:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %edi, %eax
+; X86-BMI1BMI2-NEXT:    movl %esi, %edx
 ; X86-BMI1BMI2-NEXT:    addl $4, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
@@ -2785,7 +2780,6 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB35_2
 ; X86-NOBMI-NEXT:  # %bb.1:
@@ -2816,7 +2810,6 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB35_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
@@ -2842,26 +2835,25 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    pushl %eax
-; X86-BMI1BMI2-NEXT:    movb $64, %cl
-; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %esi
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movb $64, %al
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %edi
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %edi, %esi
+; X86-BMI1BMI2-NEXT:    testb $32, %al
 ; X86-BMI1BMI2-NEXT:    je .LBB35_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edi, %esi
-; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %edi
+; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT:  .LBB35_2:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %edi, %eax
+; X86-BMI1BMI2-NEXT:    movl %esi, %edx
 ; X86-BMI1BMI2-NEXT:    addl $4, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
@@ -2944,24 +2936,23 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB36_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB36_2:
-; X86-NOBMI-NEXT:    movl (%edx), %esi
-; X86-NOBMI-NEXT:    andl %eax, %esi
-; X86-NOBMI-NEXT:    movl 4(%edx), %edi
-; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl 4(%edx), %esi
+; X86-NOBMI-NEXT:    andl %ebx, %esi
+; X86-NOBMI-NEXT:    movl (%edx), %edi
+; X86-NOBMI-NEXT:    andl %eax, %edi
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    calll use64
 ; X86-NOBMI-NEXT:    addl $16, %esp
-; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    movl %esi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -2978,24 +2969,23 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB36_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB36_2:
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %esi
-; X86-BMI1NOTBM-NEXT:    andl %eax, %esi
-; X86-BMI1NOTBM-NEXT:    movl 4(%edx), %edi
-; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl 4(%edx), %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %esi
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edi
+; X86-BMI1NOTBM-NEXT:    andl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    calll use64
 ; X86-BMI1NOTBM-NEXT:    addl $16, %esp
-; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %esi, %edx
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    popl %ebx
@@ -3006,29 +2996,28 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1BMI2-NEXT:    movb $64, %cl
-; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %ebx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    movb $64, %bl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %ebx, %ecx, %edx
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
 ; X86-BMI1BMI2-NEXT:    je .LBB36_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI1BMI2-NEXT:    movl %edx, %ecx
+; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:  .LBB36_2:
-; X86-BMI1BMI2-NEXT:    movl (%edx), %esi
-; X86-BMI1BMI2-NEXT:    andl %eax, %esi
-; X86-BMI1BMI2-NEXT:    movl 4(%edx), %edi
-; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
+; X86-BMI1BMI2-NEXT:    movl 4(%eax), %esi
+; X86-BMI1BMI2-NEXT:    andl %edx, %esi
+; X86-BMI1BMI2-NEXT:    movl (%eax), %edi
+; X86-BMI1BMI2-NEXT:    andl %ecx, %edi
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    pushl %ebx
-; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    pushl %edx
+; X86-BMI1BMI2-NEXT:    pushl %ecx
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    movl %edi, %eax
+; X86-BMI1BMI2-NEXT:    movl %esi, %edx
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    popl %ebx
@@ -3098,24 +3087,23 @@ define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movl $-1, %ebx
 ; X86-NOBMI-NEXT:    shrl %cl, %ebx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB37_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB37_2:
-; X86-NOBMI-NEXT:    movl (%edx), %esi
-; X86-NOBMI-NEXT:    andl %eax, %esi
-; X86-NOBMI-NEXT:    movl 4(%edx), %edi
-; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl 4(%edx), %esi
+; X86-NOBMI-NEXT:    andl %ebx, %esi
+; X86-NOBMI-NEXT:    movl (%edx), %edi
+; X86-NOBMI-NEXT:    andl %eax, %edi
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    calll use64
 ; X86-NOBMI-NEXT:    addl $16, %esp
-; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    movl %esi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -3132,24 +3120,23 @@ define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB37_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB37_2:
-; X86-BMI1NOTBM-NEXT:    movl (%edx), %esi
-; X86-BMI1NOTBM-NEXT:    andl %eax, %esi
-; X86-BMI1NOTBM-NEXT:    movl 4(%edx), %edi
-; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl 4(%edx), %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %esi
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edi
+; X86-BMI1NOTBM-NEXT:    andl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    calll use64
 ; X86-BMI1NOTBM-NEXT:    addl $16, %esp
-; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %esi, %edx
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    popl %ebx
@@ -3160,29 +3147,28 @@ define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1BMI2-NEXT:    movb $64, %cl
-; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %ebx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    movb $64, %bl
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %ebx, %ecx, %edx
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
 ; X86-BMI1BMI2-NEXT:    je .LBB37_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI1BMI2-NEXT:    movl %edx, %ecx
+; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:  .LBB37_2:
-; X86-BMI1BMI2-NEXT:    movl (%edx), %esi
-; X86-BMI1BMI2-NEXT:    andl %eax, %esi
-; X86-BMI1BMI2-NEXT:    movl 4(%edx), %edi
-; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
+; X86-BMI1BMI2-NEXT:    movl 4(%eax), %esi
+; X86-BMI1BMI2-NEXT:    andl %edx, %esi
+; X86-BMI1BMI2-NEXT:    movl (%eax), %edi
+; X86-BMI1BMI2-NEXT:    andl %ecx, %edi
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    pushl %ebx
-; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    pushl %edx
+; X86-BMI1BMI2-NEXT:    pushl %ecx
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    movl %edi, %eax
+; X86-BMI1BMI2-NEXT:    movl %esi, %edx
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    popl %ebx
@@ -3253,7 +3239,6 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB38_2
 ; X86-NOBMI-NEXT:  # %bb.1:
@@ -3284,7 +3269,6 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB38_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
@@ -3310,26 +3294,25 @@ define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    pushl %eax
-; X86-BMI1BMI2-NEXT:    movb $64, %cl
-; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %esi
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
-; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    movb $64, %al
+; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %edi
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %edi, %esi
+; X86-BMI1BMI2-NEXT:    testb $32, %al
 ; X86-BMI1BMI2-NEXT:    je .LBB38_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edi, %esi
-; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %edi
+; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT:  .LBB38_2:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %edi, %eax
+; X86-BMI1BMI2-NEXT:    movl %esi, %edx
 ; X86-BMI1BMI2-NEXT:    addl $4, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
@@ -3407,14 +3390,12 @@ define i32 @bzhi64_32_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB39_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
+; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:  .LBB39_2:
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    retl
@@ -3423,14 +3404,12 @@ define i32 @bzhi64_32_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    jne .LBB39_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:  .LBB39_2:
 ; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
@@ -3439,13 +3418,11 @@ define i32 @bzhi64_32_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %edx
 ; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB39_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %eax
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-BMI1BMI2-NEXT:  .LBB39_2:
 ; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 465dea578267..335e64f99c10 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -587,14 +587,9 @@ define i32 @combine_fshl_load_i32(i32* %p) nounwind {
 define i64 @combine_fshl_load_i64(i64* %p) nounwind {
 ; X86-FAST-LABEL: combine_fshl_load_i64:
 ; X86-FAST:       # %bb.0:
-; X86-FAST-NEXT:    pushl %esi
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-FAST-NEXT:    movl 12(%ecx), %eax
-; X86-FAST-NEXT:    movl 16(%ecx), %esi
-; X86-FAST-NEXT:    movl 20(%ecx), %edx
-; X86-FAST-NEXT:    shldl $24, %esi, %edx
-; X86-FAST-NEXT:    shrdl $8, %esi, %eax
-; X86-FAST-NEXT:    popl %esi
+; X86-FAST-NEXT:    movl 13(%ecx), %eax
+; X86-FAST-NEXT:    movl 17(%ecx), %edx
 ; X86-FAST-NEXT:    retl
 ;
 ; X86-SLOW-LABEL: combine_fshl_load_i64:

diff  --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 644e33fe198c..2238ff4a3101 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -582,16 +582,9 @@ define i32 @combine_fshr_load_i32(i32* %p) nounwind {
 define i64 @combine_fshr_load_i64(i64* %p) nounwind {
 ; X86-FAST-LABEL: combine_fshr_load_i64:
 ; X86-FAST:       # %bb.0:
-; X86-FAST-NEXT:    pushl %esi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT:    movzbl 11(%eax), %ecx
-; X86-FAST-NEXT:    movl 12(%eax), %esi
-; X86-FAST-NEXT:    movl 16(%eax), %edx
-; X86-FAST-NEXT:    shldl $8, %esi, %edx
-; X86-FAST-NEXT:    movl %esi, %eax
-; X86-FAST-NEXT:    shll $8, %eax
-; X86-FAST-NEXT:    orl %ecx, %eax
-; X86-FAST-NEXT:    popl %esi
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    movl 11(%ecx), %eax
+; X86-FAST-NEXT:    movl 15(%ecx), %edx
 ; X86-FAST-NEXT:    retl
 ;
 ; X86-SLOW-LABEL: combine_fshr_load_i64:

diff  --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index 59fe62c0e4b4..f5673c7e8bd3 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -290,7 +290,6 @@ define i64 @ashr_add_shl_mismatch_shifts2(i64 %r) nounwind {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    shrdl $8, %edx, %eax
 ; X32-NEXT:    shrl $8, %edx
 ; X32-NEXT:    incl %edx
 ; X32-NEXT:    shrdl $8, %edx, %eax

diff  --git a/llvm/test/CodeGen/X86/shift-parts.ll b/llvm/test/CodeGen/X86/shift-parts.ll
index 2dc35f6bfef0..da00f377020d 100644
--- a/llvm/test/CodeGen/X86/shift-parts.ll
+++ b/llvm/test/CodeGen/X86/shift-parts.ll
@@ -10,15 +10,14 @@ define i32 @int87(i32 %uint64p_8, i1 %cond) nounwind {
 ; CHECK-LABEL: int87:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq g_144+{{.*}}(%rip), %rax
-; CHECK-NEXT:    movq g_144+{{.*}}(%rip), %rdx
-; CHECK-NEXT:    movzbl %sil, %ecx
-; CHECK-NEXT:    shll $6, %ecx
+; CHECK-NEXT:    movq g_144+{{.*}}(%rip), %rcx
+; CHECK-NEXT:    movzbl %sil, %edx
+; CHECK-NEXT:    shll $6, %edx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_1: # %for.cond
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq %rdx, %rsi
-; CHECK-NEXT:    shrdq %cl, %rax, %rsi
-; CHECK-NEXT:    testb $64, %cl
+; CHECK-NEXT:    testb $64, %dl
+; CHECK-NEXT:    movq %rcx, %rsi
 ; CHECK-NEXT:    cmovneq %rax, %rsi
 ; CHECK-NEXT:    orl $0, %esi
 ; CHECK-NEXT:    je .LBB0_1

