[llvm] 1794143 - [TargetLowering] Improve expansion of FSHL/FSHR
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu May 14 08:36:36 PDT 2020
Author: Jay Foad
Date: 2020-05-14T16:36:22+01:00
New Revision: 17941437a2ed8abefef719345391da94e6df8ebb
URL: https://github.com/llvm/llvm-project/commit/17941437a2ed8abefef719345391da94e6df8ebb
DIFF: https://github.com/llvm/llvm-project/commit/17941437a2ed8abefef719345391da94e6df8ebb.diff
LOG: [TargetLowering] Improve expansion of FSHL/FSHR
Use an extra shift-by-1 instead of a compare and select to handle the
shift-by-zero case. This sometimes saves one instruction (if the compare
couldn't be combined with a previous instruction). It also works better
on targets that don't have good select instructions.
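For reference, the new expansion can be modelled in plain C++ for a 32-bit
element type. This is only a scalar sketch of the formulas used below, not the
DAG-building code itself; the helper names fshl32/fshr32 are made up for the
illustration:

    #include <cstdint>

    // Model of the new fshl expansion for BW = 32. Both shift amounts stay
    // in [0, 31], so no individual shift is undefined, and ShAmt == 0
    // yields X | 0 == X, matching the fshl semantics.
    uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
      unsigned ShAmt = Z & 31;        // Z % BW (BW is a power of two here)
      unsigned InvShAmt = 31 - ShAmt; // (BW - 1) - (Z % BW)
      return (X << ShAmt) | ((Y >> 1) >> InvShAmt);
    }

    // Model of the new fshr expansion. ShAmt == 0 yields 0 | Y == Y,
    // matching the fshr semantics.
    uint32_t fshr32(uint32_t X, uint32_t Y, uint32_t Z) {
      unsigned ShAmt = Z & 31;
      unsigned InvShAmt = 31 - ShAmt;
      return ((X << 1) << InvShAmt) | (Y >> ShAmt);
    }
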
Note that currently this change doesn't affect most targets, since
expandFunnelShift is not reached: funnel shift intrinsics are lowered
early in SelectionDAGBuilder. There is work afoot to change that; see
D77152.
Differential Revision: https://reviews.llvm.org/D77301
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/X86/fshl.ll
llvm/test/CodeGen/X86/fshr.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e0f7040165d5..90f9e17240a3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6046,8 +6046,8 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
!isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
return false;
- // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
- // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+ // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
+ // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
SDValue X = Node->getOperand(0);
SDValue Y = Node->getOperand(1);
SDValue Z = Node->getOperand(2);
@@ -6057,30 +6057,29 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
SDLoc DL(SDValue(Node, 0));
EVT ShVT = Z.getValueType();
- SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
- SDValue Zero = DAG.getConstant(0, DL, ShVT);
-
+ SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
SDValue ShAmt;
if (isPowerOf2_32(EltSizeInBits)) {
- SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
+ // Z % BW -> Z & (BW - 1)
ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
} else {
+ SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
}
+ SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, Mask, ShAmt);
- SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt);
- SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt);
- SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt);
- SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
-
- // If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
- // and that is undefined. We must compare and select to avoid UB.
- EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT);
-
- // For fshl, 0-shift returns the 1st arg (X).
- // For fshr, 0-shift returns the 2nd arg (Y).
- SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ);
- Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? X : Y, Or);
+ SDValue One = DAG.getConstant(1, DL, ShVT);
+ SDValue ShX, ShY;
+ if (IsFSHL) {
+ ShX = DAG.getNode(ISD::SHL, DL, VT, X, ShAmt);
+ SDValue ShY1 = DAG.getNode(ISD::SRL, DL, VT, Y, One);
+ ShY = DAG.getNode(ISD::SRL, DL, VT, ShY1, InvShAmt);
+ } else {
+ SDValue ShX1 = DAG.getNode(ISD::SHL, DL, VT, X, One);
+ ShX = DAG.getNode(ISD::SHL, DL, VT, ShX1, InvShAmt);
+ ShY = DAG.getNode(ISD::SRL, DL, VT, Y, ShAmt);
+ }
+ Result = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
return true;
}
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 1da714067aa4..99f5a3e923bb 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -65,27 +65,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i16:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: pushl %edi
-; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
-; X86-SLOW-NEXT: andb $15, %dl
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, %edi
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: shll %cl, %edi
-; X86-SLOW-NEXT: movb $16, %cl
-; X86-SLOW-NEXT: subb %dl, %cl
-; X86-SLOW-NEXT: shrl %cl, %esi
-; X86-SLOW-NEXT: testb %dl, %dl
-; X86-SLOW-NEXT: je .LBB1_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %esi, %edi
-; X86-SLOW-NEXT: movl %edi, %eax
-; X86-SLOW-NEXT: .LBB1_2:
+; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SLOW-NEXT: andb $15, %cl
+; X86-SLOW-NEXT: shll %cl, %edx
+; X86-SLOW-NEXT: shrl %eax
+; X86-SLOW-NEXT: xorb $15, %cl
+; X86-SLOW-NEXT: shrl %cl, %eax
+; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-SLOW-NEXT: popl %esi
-; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i16:
@@ -100,17 +89,15 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movzwl %si, %eax
-; X64-SLOW-NEXT: andb $15, %dl
-; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shll %cl, %esi
-; X64-SLOW-NEXT: movb $16, %cl
-; X64-SLOW-NEXT: subb %dl, %cl
+; X64-SLOW-NEXT: movzwl %si, %eax
+; X64-SLOW-NEXT: andb $15, %cl
+; X64-SLOW-NEXT: shll %cl, %edi
+; X64-SLOW-NEXT: xorb $15, %cl
+; X64-SLOW-NEXT: shrl %eax
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: shrl %cl, %eax
-; X64-SLOW-NEXT: orl %esi, %eax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmovel %edi, %eax
+; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
%tmp = tail call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
@@ -128,26 +115,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i32:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: pushl %edi
-; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, %edi
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: shll %cl, %edi
-; X86-SLOW-NEXT: andb $31, %dl
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: shrl %cl, %esi
-; X86-SLOW-NEXT: testb %dl, %dl
-; X86-SLOW-NEXT: je .LBB2_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %esi, %edi
-; X86-SLOW-NEXT: movl %edi, %eax
-; X86-SLOW-NEXT: .LBB2_2:
-; X86-SLOW-NEXT: popl %esi
-; X86-SLOW-NEXT: popl %edi
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: shll %cl, %edx
+; X86-SLOW-NEXT: shrl %eax
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
+; X86-SLOW-NEXT: shrl %cl, %eax
+; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i32:
@@ -160,17 +136,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movl %esi, %eax
-; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shll %cl, %esi
-; X64-SLOW-NEXT: andb $31, %dl
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negb %cl
+; X64-SLOW-NEXT: movl %esi, %eax
+; X64-SLOW-NEXT: shll %cl, %edi
+; X64-SLOW-NEXT: shrl %eax
+; X64-SLOW-NEXT: andb $31, %cl
+; X64-SLOW-NEXT: xorb $31, %cl
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: shrl %cl, %eax
-; X64-SLOW-NEXT: orl %esi, %eax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmovel %edi, %eax
+; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %tmp
@@ -279,78 +253,61 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: pushl %ebx
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: subl $8, %esp
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: andl $63, %ebx
-; X86-SLOW-NEXT: movb $64, %dh
-; X86-SLOW-NEXT: subb %bl, %dh
-; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: movb %dh, %cl
-; X86-SLOW-NEXT: shrl %cl, %eax
-; X86-SLOW-NEXT: movb %dh, %dl
-; X86-SLOW-NEXT: andb $31, %dl
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: movl %esi, %ebp
-; X86-SLOW-NEXT: shll %cl, %ebp
-; X86-SLOW-NEXT: testb %dl, %dl
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: je .LBB5_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %eax, %ebp
-; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: .LBB5_2:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT: movl %ebp, %eax
-; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: shll %cl, %eax
-; X86-SLOW-NEXT: movb %bl, %ch
-; X86-SLOW-NEXT: andb $31, %ch
+; X86-SLOW-NEXT: movb $64, %ch
+; X86-SLOW-NEXT: subb %bl, %ch
; X86-SLOW-NEXT: movb %ch, %cl
-; X86-SLOW-NEXT: negb %cl
+; X86-SLOW-NEXT: shrl %cl, %edx
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
+; X86-SLOW-NEXT: addl %eax, %eax
+; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: movb %bl, %cl
+; X86-SLOW-NEXT: shll %cl, %ebp
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT: movl %esi, %edi
+; X86-SLOW-NEXT: shrl %edi
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: testb %ch, %ch
-; X86-SLOW-NEXT: je .LBB5_4
-; X86-SLOW-NEXT: # %bb.3:
-; X86-SLOW-NEXT: orl %edi, %eax
-; X86-SLOW-NEXT: movl %eax, %ebp
-; X86-SLOW-NEXT: .LBB5_4:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, %edi
-; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: movb %bl, %cl
+; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testb $32, %bl
-; X86-SLOW-NEXT: je .LBB5_6
+; X86-SLOW-NEXT: jne .LBB5_1
+; X86-SLOW-NEXT: # %bb.2:
+; X86-SLOW-NEXT: orl %edi, %ebp
+; X86-SLOW-NEXT: jmp .LBB5_3
+; X86-SLOW-NEXT: .LBB5_1:
+; X86-SLOW-NEXT: movl %esi, %ebp
+; X86-SLOW-NEXT: xorl %esi, %esi
+; X86-SLOW-NEXT: .LBB5_3:
+; X86-SLOW-NEXT: movb %ch, %cl
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: testb $32, %ch
+; X86-SLOW-NEXT: jne .LBB5_4
; X86-SLOW-NEXT: # %bb.5:
-; X86-SLOW-NEXT: movl %edi, %ebp
+; X86-SLOW-NEXT: orl %edx, %eax
+; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: jmp .LBB5_6
+; X86-SLOW-NEXT: .LBB5_4:
+; X86-SLOW-NEXT: movl %edi, %ecx
; X86-SLOW-NEXT: xorl %edi, %edi
; X86-SLOW-NEXT: .LBB5_6:
-; X86-SLOW-NEXT: movb %dh, %cl
-; X86-SLOW-NEXT: shrl %cl, %esi
-; X86-SLOW-NEXT: testb $32, %dh
-; X86-SLOW-NEXT: jne .LBB5_7
-; X86-SLOW-NEXT: # %bb.8:
-; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SLOW-NEXT: testl %ebx, %ebx
-; X86-SLOW-NEXT: jne .LBB5_10
-; X86-SLOW-NEXT: jmp .LBB5_11
-; X86-SLOW-NEXT: .LBB5_7:
-; X86-SLOW-NEXT: movl %esi, %ecx
-; X86-SLOW-NEXT: xorl %esi, %esi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: testl %ebx, %ebx
-; X86-SLOW-NEXT: je .LBB5_11
-; X86-SLOW-NEXT: .LBB5_10:
-; X86-SLOW-NEXT: orl %esi, %ebp
-; X86-SLOW-NEXT: orl %ecx, %edi
-; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %edi, %eax
-; X86-SLOW-NEXT: .LBB5_11:
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SLOW-NEXT: addl $8, %esp
+; X86-SLOW-NEXT: je .LBB5_8
+; X86-SLOW-NEXT: # %bb.7:
+; X86-SLOW-NEXT: orl %edi, %ebp
+; X86-SLOW-NEXT: orl %ecx, %esi
+; X86-SLOW-NEXT: movl %ebp, %edx
+; X86-SLOW-NEXT: movl %esi, %eax
+; X86-SLOW-NEXT: .LBB5_8:
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: popl %ebx
@@ -367,17 +324,15 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
+; X64-SLOW-NEXT: movq %rdx, %rcx
; X64-SLOW-NEXT: movq %rsi, %rax
-; X64-SLOW-NEXT: movq %rdi, %rsi
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shlq %cl, %rsi
-; X64-SLOW-NEXT: andb $63, %dl
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negb %cl
+; X64-SLOW-NEXT: shlq %cl, %rdi
+; X64-SLOW-NEXT: shrq %rax
+; X64-SLOW-NEXT: andb $63, %cl
+; X64-SLOW-NEXT: xorb $63, %cl
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-SLOW-NEXT: shrq %cl, %rax
-; X64-SLOW-NEXT: orq %rsi, %rax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmoveq %rdi, %rax
+; X64-SLOW-NEXT: orq %rdi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
ret i64 %tmp
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 06fe8dfc2413..6e18c13fecb1 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -65,27 +65,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i16:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: pushl %edi
-; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
-; X86-SLOW-NEXT: andb $15, %dl
-; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, %edi
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: movb $16, %cl
-; X86-SLOW-NEXT: subb %dl, %cl
-; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: testb %dl, %dl
-; X86-SLOW-NEXT: je .LBB1_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %edi, %esi
-; X86-SLOW-NEXT: movl %esi, %eax
-; X86-SLOW-NEXT: .LBB1_2:
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SLOW-NEXT: andb $15, %cl
+; X86-SLOW-NEXT: shrl %cl, %edx
+; X86-SLOW-NEXT: addl %eax, %eax
+; X86-SLOW-NEXT: xorb $15, %cl
+; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-SLOW-NEXT: popl %esi
-; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i16:
@@ -100,16 +89,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movzwl %si, %eax
-; X64-SLOW-NEXT: andb $15, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shrl %cl, %eax
-; X64-SLOW-NEXT: movb $16, %cl
-; X64-SLOW-NEXT: subb %dl, %cl
-; X64-SLOW-NEXT: shll %cl, %edi
-; X64-SLOW-NEXT: orl %edi, %eax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmovel %esi, %eax
+; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-SLOW-NEXT: movzwl %si, %edx
+; X64-SLOW-NEXT: andb $15, %cl
+; X64-SLOW-NEXT: shrl %cl, %edx
+; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax
+; X64-SLOW-NEXT: xorb $15, %cl
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SLOW-NEXT: shll %cl, %eax
+; X64-SLOW-NEXT: orl %edx, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
%tmp = tail call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
@@ -127,26 +116,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i32:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: pushl %edi
-; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, %edi
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: andb $31, %dl
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: testb %dl, %dl
-; X86-SLOW-NEXT: je .LBB2_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %edi, %esi
-; X86-SLOW-NEXT: movl %esi, %eax
-; X86-SLOW-NEXT: .LBB2_2:
-; X86-SLOW-NEXT: popl %esi
-; X86-SLOW-NEXT: popl %edi
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: shrl %cl, %edx
+; X86-SLOW-NEXT: addl %eax, %eax
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
+; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i32:
@@ -159,17 +137,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movl %edi, %eax
-; X64-SLOW-NEXT: movl %esi, %edi
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shrl %cl, %edi
-; X64-SLOW-NEXT: andb $31, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negb %cl
+; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-SLOW-NEXT: shrl %cl, %esi
+; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax
+; X64-SLOW-NEXT: andb $31, %cl
+; X64-SLOW-NEXT: xorb $31, %cl
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: shll %cl, %eax
-; X64-SLOW-NEXT: orl %edi, %eax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmovel %esi, %eax
+; X64-SLOW-NEXT: orl %esi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %tmp
@@ -276,76 +252,61 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: subl $8, %esp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: andl $63, %ebx
-; X86-SLOW-NEXT: movb $64, %al
-; X86-SLOW-NEXT: subb %bl, %al
-; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: movl %eax, %ecx
-; X86-SLOW-NEXT: shll %cl, %edx
-; X86-SLOW-NEXT: movb %al, %ch
-; X86-SLOW-NEXT: andb $31, %ch
+; X86-SLOW-NEXT: movb $64, %ch
+; X86-SLOW-NEXT: subb %bl, %ch
; X86-SLOW-NEXT: movb %ch, %cl
-; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: movl %esi, %edi
-; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: testb %ch, %ch
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT: je .LBB5_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %edi, %edx
-; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: .LBB5_2:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: shrl %cl, %edx
-; X86-SLOW-NEXT: movb %bl, %ah
-; X86-SLOW-NEXT: andb $31, %ah
-; X86-SLOW-NEXT: movb %ah, %cl
-; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: movl %ebp, %edi
-; X86-SLOW-NEXT: shll %cl, %edi
-; X86-SLOW-NEXT: testb %ah, %ah
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT: je .LBB5_4
-; X86-SLOW-NEXT: # %bb.3:
-; X86-SLOW-NEXT: orl %edx, %edi
-; X86-SLOW-NEXT: movl %edi, %ebp
-; X86-SLOW-NEXT: .LBB5_4:
+; X86-SLOW-NEXT: shll %cl, %edx
+; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %esi, %edx
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
+; X86-SLOW-NEXT: shrl %esi
+; X86-SLOW-NEXT: shrl %cl, %esi
+; X86-SLOW-NEXT: movb %bl, %cl
+; X86-SLOW-NEXT: shrl %cl, %eax
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SLOW-NEXT: movl %ebx, %ecx
+; X86-SLOW-NEXT: leal (%edi,%edi), %ebp
+; X86-SLOW-NEXT: shll %cl, %ebp
+; X86-SLOW-NEXT: movb %bl, %cl
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb $32, %bl
-; X86-SLOW-NEXT: je .LBB5_6
-; X86-SLOW-NEXT: # %bb.5:
+; X86-SLOW-NEXT: jne .LBB5_1
+; X86-SLOW-NEXT: # %bb.2:
+; X86-SLOW-NEXT: orl %eax, %ebp
+; X86-SLOW-NEXT: jmp .LBB5_3
+; X86-SLOW-NEXT: .LBB5_1:
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: xorl %edi, %edi
+; X86-SLOW-NEXT: .LBB5_3:
+; X86-SLOW-NEXT: movb %ch, %cl
+; X86-SLOW-NEXT: shll %cl, %edx
+; X86-SLOW-NEXT: testb $32, %ch
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: jne .LBB5_4
+; X86-SLOW-NEXT: # %bb.5:
+; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SLOW-NEXT: orl %esi, %ecx
+; X86-SLOW-NEXT: jmp .LBB5_6
+; X86-SLOW-NEXT: .LBB5_4:
+; X86-SLOW-NEXT: movl %edx, %ecx
+; X86-SLOW-NEXT: movl $0, (%esp) # 4-byte Folded Spill
; X86-SLOW-NEXT: .LBB5_6:
-; X86-SLOW-NEXT: movl %eax, %ecx
-; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: testb $32, %al
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: jne .LBB5_7
-; X86-SLOW-NEXT: # %bb.8:
-; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-SLOW-NEXT: testl %ebx, %ebx
-; X86-SLOW-NEXT: jne .LBB5_10
-; X86-SLOW-NEXT: jmp .LBB5_11
-; X86-SLOW-NEXT: .LBB5_7:
-; X86-SLOW-NEXT: movl %esi, %eax
-; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: testl %ebx, %ebx
-; X86-SLOW-NEXT: je .LBB5_11
-; X86-SLOW-NEXT: .LBB5_10:
-; X86-SLOW-NEXT: orl %ebp, %esi
-; X86-SLOW-NEXT: orl %edi, %eax
-; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %eax, %edx
-; X86-SLOW-NEXT: .LBB5_11:
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SLOW-NEXT: je .LBB5_8
+; X86-SLOW-NEXT: # %bb.7:
+; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-SLOW-NEXT: orl %ebp, %eax
+; X86-SLOW-NEXT: orl %edi, %ecx
+; X86-SLOW-NEXT: movl %ecx, %edx
+; X86-SLOW-NEXT: .LBB5_8:
; X86-SLOW-NEXT: addl $8, %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
@@ -363,17 +324,14 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movq %rdi, %rax
-; X64-SLOW-NEXT: movq %rsi, %rdi
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shrq %cl, %rdi
-; X64-SLOW-NEXT: andb $63, %dl
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negb %cl
+; X64-SLOW-NEXT: movq %rdx, %rcx
+; X64-SLOW-NEXT: shrq %cl, %rsi
+; X64-SLOW-NEXT: leaq (%rdi,%rdi), %rax
+; X64-SLOW-NEXT: andb $63, %cl
+; X64-SLOW-NEXT: xorb $63, %cl
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-SLOW-NEXT: shlq %cl, %rax
-; X64-SLOW-NEXT: orq %rdi, %rax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmoveq %rsi, %rax
+; X64-SLOW-NEXT: orq %rsi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z)
ret i64 %tmp