[llvm] 1794143 - [TargetLowering] Improve expansion of FSHL/FSHR
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu May 14 08:36:36 PDT 2020
Author: Jay Foad
Date: 2020-05-14T16:36:22+01:00
New Revision: 17941437a2ed8abefef719345391da94e6df8ebb
URL: https://github.com/llvm/llvm-project/commit/17941437a2ed8abefef719345391da94e6df8ebb
DIFF: https://github.com/llvm/llvm-project/commit/17941437a2ed8abefef719345391da94e6df8ebb.diff
LOG: [TargetLowering] Improve expansion of FSHL/FSHR
Use an extra shift-by-1 instead of a compare and select to handle the
shift-by-zero case. This sometimes saves one instruction (if the compare
couldn't be combined with a previous instruction). It also works better
on targets that don't have good select instructions.
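For reference, the new expansion can be modelled in plain C++ for a 32-bit
element type. This is only a scalar sketch of the formulas used below, not the
DAG-building code itself; the helper names fshl32/fshr32 are made up for the
illustration:

    #include <cstdint>

    // Model of the new fshl expansion for BW = 32. Both shift amounts stay
    // in [0, 31], so no individual shift is undefined, and ShAmt == 0
    // yields X | 0 == X, matching the fshl semantics.
    uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
      unsigned ShAmt = Z & 31;        // Z % BW (BW is a power of two here)
      unsigned InvShAmt = 31 - ShAmt; // (BW - 1) - (Z % BW)
      return (X << ShAmt) | ((Y >> 1) >> InvShAmt);
    }

    // Model of the new fshr expansion. ShAmt == 0 yields 0 | Y == Y,
    // matching the fshr semantics.
    uint32_t fshr32(uint32_t X, uint32_t Y, uint32_t Z) {
      unsigned ShAmt = Z & 31;
      unsigned InvShAmt = 31 - ShAmt;
      return ((X << 1) << InvShAmt) | (Y >> ShAmt);
    }
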
Note that currently this change doesn't affect most targets, since
expandFunnelShift is not reached: funnel shift intrinsics are lowered
early in SelectionDAGBuilder. There is work afoot to change that; see
D77152.
Differential Revision: https://reviews.llvm.org/D77301
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/X86/fshl.ll
llvm/test/CodeGen/X86/fshr.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e0f7040165d5..90f9e17240a3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6046,8 +6046,8 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
!isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
return false;
- // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
- // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+ // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
+ // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
SDValue X = Node->getOperand(0);
SDValue Y = Node->getOperand(1);
SDValue Z = Node->getOperand(2);
@@ -6057,30 +6057,29 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
SDLoc DL(SDValue(Node, 0));
EVT ShVT = Z.getValueType();
- SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
- SDValue Zero = DAG.getConstant(0, DL, ShVT);
-
+ SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
SDValue ShAmt;
if (isPowerOf2_32(EltSizeInBits)) {
- SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
+ // Z % BW -> Z & (BW - 1)
ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
} else {
+ SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
}
+ SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, Mask, ShAmt);
- SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt);
- SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt);
- SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt);
- SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
-
- // If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
- // and that is undefined. We must compare and select to avoid UB.
- EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT);
-
- // For fshl, 0-shift returns the 1st arg (X).
- // For fshr, 0-shift returns the 2nd arg (Y).
- SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ);
- Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? X : Y, Or);
+ SDValue One = DAG.getConstant(1, DL, ShVT);
+ SDValue ShX, ShY;
+ if (IsFSHL) {
+ ShX = DAG.getNode(ISD::SHL, DL, VT, X, ShAmt);
+ SDValue ShY1 = DAG.getNode(ISD::SRL, DL, VT, Y, One);
+ ShY = DAG.getNode(ISD::SRL, DL, VT, ShY1, InvShAmt);
+ } else {
+ SDValue ShX1 = DAG.getNode(ISD::SHL, DL, VT, X, One);
+ ShX = DAG.getNode(ISD::SHL, DL, VT, ShX1, InvShAmt);
+ ShY = DAG.getNode(ISD::SRL, DL, VT, Y, ShAmt);
+ }
+ Result = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
return true;
}
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 1da714067aa4..99f5a3e923bb 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -65,27 +65,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i16:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: pushl %edi
-; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
-; X86-SLOW-NEXT: andb $15, %dl
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, %edi
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: shll %cl, %edi
-; X86-SLOW-NEXT: movb $16, %cl
-; X86-SLOW-NEXT: subb %dl, %cl
-; X86-SLOW-NEXT: shrl %cl, %esi
-; X86-SLOW-NEXT: testb %dl, %dl
-; X86-SLOW-NEXT: je .LBB1_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %esi, %edi
-; X86-SLOW-NEXT: movl %edi, %eax
-; X86-SLOW-NEXT: .LBB1_2:
+; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SLOW-NEXT: andb $15, %cl
+; X86-SLOW-NEXT: shll %cl, %edx
+; X86-SLOW-NEXT: shrl %eax
+; X86-SLOW-NEXT: xorb $15, %cl
+; X86-SLOW-NEXT: shrl %cl, %eax
+; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-SLOW-NEXT: popl %esi
-; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i16:
@@ -100,17 +89,15 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movzwl %si, %eax
-; X64-SLOW-NEXT: andb $15, %dl
-; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shll %cl, %esi
-; X64-SLOW-NEXT: movb $16, %cl
-; X64-SLOW-NEXT: subb %dl, %cl
+; X64-SLOW-NEXT: movzwl %si, %eax
+; X64-SLOW-NEXT: andb $15, %cl
+; X64-SLOW-NEXT: shll %cl, %edi
+; X64-SLOW-NEXT: xorb $15, %cl
+; X64-SLOW-NEXT: shrl %eax
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: shrl %cl, %eax
-; X64-SLOW-NEXT: orl %esi, %eax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmovel %edi, %eax
+; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
%tmp = tail call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
@@ -128,26 +115,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i32:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: pushl %edi
-; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, %edi
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: shll %cl, %edi
-; X86-SLOW-NEXT: andb $31, %dl
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: shrl %cl, %esi
-; X86-SLOW-NEXT: testb %dl, %dl
-; X86-SLOW-NEXT: je .LBB2_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %esi, %edi
-; X86-SLOW-NEXT: movl %edi, %eax
-; X86-SLOW-NEXT: .LBB2_2:
-; X86-SLOW-NEXT: popl %esi
-; X86-SLOW-NEXT: popl %edi
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: shll %cl, %edx
+; X86-SLOW-NEXT: shrl %eax
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
+; X86-SLOW-NEXT: shrl %cl, %eax
+; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i32:
@@ -160,17 +136,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movl %esi, %eax
-; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shll %cl, %esi
-; X64-SLOW-NEXT: andb $31, %dl
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negb %cl
+; X64-SLOW-NEXT: movl %esi, %eax
+; X64-SLOW-NEXT: shll %cl, %edi
+; X64-SLOW-NEXT: shrl %eax
+; X64-SLOW-NEXT: andb $31, %cl
+; X64-SLOW-NEXT: xorb $31, %cl
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: shrl %cl, %eax
-; X64-SLOW-NEXT: orl %esi, %eax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmovel %edi, %eax
+; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %tmp
@@ -279,78 +253,61 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: pushl %ebx
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: subl $8, %esp
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: andl $63, %ebx
-; X86-SLOW-NEXT: movb $64, %dh
-; X86-SLOW-NEXT: subb %bl, %dh
-; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: movb %dh, %cl
-; X86-SLOW-NEXT: shrl %cl, %eax
-; X86-SLOW-NEXT: movb %dh, %dl
-; X86-SLOW-NEXT: andb $31, %dl
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: movl %esi, %ebp
-; X86-SLOW-NEXT: shll %cl, %ebp
-; X86-SLOW-NEXT: testb %dl, %dl
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: je .LBB5_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %eax, %ebp
-; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: .LBB5_2:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT: movl %ebp, %eax
-; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: shll %cl, %eax
-; X86-SLOW-NEXT: movb %bl, %ch
-; X86-SLOW-NEXT: andb $31, %ch
+; X86-SLOW-NEXT: movb $64, %ch
+; X86-SLOW-NEXT: subb %bl, %ch
; X86-SLOW-NEXT: movb %ch, %cl
-; X86-SLOW-NEXT: negb %cl
+; X86-SLOW-NEXT: shrl %cl, %edx
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
+; X86-SLOW-NEXT: addl %eax, %eax
+; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: movb %bl, %cl
+; X86-SLOW-NEXT: shll %cl, %ebp
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT: movl %esi, %edi
+; X86-SLOW-NEXT: shrl %edi
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: testb %ch, %ch
-; X86-SLOW-NEXT: je .LBB5_4
-; X86-SLOW-NEXT: # %bb.3:
-; X86-SLOW-NEXT: orl %edi, %eax
-; X86-SLOW-NEXT: movl %eax, %ebp
-; X86-SLOW-NEXT: .LBB5_4:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, %edi
-; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: movb %bl, %cl
+; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testb $32, %bl
-; X86-SLOW-NEXT: je .LBB5_6
+; X86-SLOW-NEXT: jne .LBB5_1
+; X86-SLOW-NEXT: # %bb.2:
+; X86-SLOW-NEXT: orl %edi, %ebp
+; X86-SLOW-NEXT: jmp .LBB5_3
+; X86-SLOW-NEXT: .LBB5_1:
+; X86-SLOW-NEXT: movl %esi, %ebp
+; X86-SLOW-NEXT: xorl %esi, %esi
+; X86-SLOW-NEXT: .LBB5_3:
+; X86-SLOW-NEXT: movb %ch, %cl
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: testb $32, %ch
+; X86-SLOW-NEXT: jne .LBB5_4
; X86-SLOW-NEXT: # %bb.5:
-; X86-SLOW-NEXT: movl %edi, %ebp
+; X86-SLOW-NEXT: orl %edx, %eax
+; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: jmp .LBB5_6
+; X86-SLOW-NEXT: .LBB5_4:
+; X86-SLOW-NEXT: movl %edi, %ecx
; X86-SLOW-NEXT: xorl %edi, %edi
; X86-SLOW-NEXT: .LBB5_6:
-; X86-SLOW-NEXT: movb %dh, %cl
-; X86-SLOW-NEXT: shrl %cl, %esi
-; X86-SLOW-NEXT: testb $32, %dh
-; X86-SLOW-NEXT: jne .LBB5_7
-; X86-SLOW-NEXT: # %bb.8:
-; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SLOW-NEXT: testl %ebx, %ebx
-; X86-SLOW-NEXT: jne .LBB5_10
-; X86-SLOW-NEXT: jmp .LBB5_11
-; X86-SLOW-NEXT: .LBB5_7:
-; X86-SLOW-NEXT: movl %esi, %ecx
-; X86-SLOW-NEXT: xorl %esi, %esi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: testl %ebx, %ebx
-; X86-SLOW-NEXT: je .LBB5_11
-; X86-SLOW-NEXT: .LBB5_10:
-; X86-SLOW-NEXT: orl %esi, %ebp
-; X86-SLOW-NEXT: orl %ecx, %edi
-; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %edi, %eax
-; X86-SLOW-NEXT: .LBB5_11:
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-SLOW-NEXT: addl $8, %esp
+; X86-SLOW-NEXT: je .LBB5_8
+; X86-SLOW-NEXT: # %bb.7:
+; X86-SLOW-NEXT: orl %edi, %ebp
+; X86-SLOW-NEXT: orl %ecx, %esi
+; X86-SLOW-NEXT: movl %ebp, %edx
+; X86-SLOW-NEXT: movl %esi, %eax
+; X86-SLOW-NEXT: .LBB5_8:
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: popl %ebx
@@ -367,17 +324,15 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
+; X64-SLOW-NEXT: movq %rdx, %rcx
; X64-SLOW-NEXT: movq %rsi, %rax
-; X64-SLOW-NEXT: movq %rdi, %rsi
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shlq %cl, %rsi
-; X64-SLOW-NEXT: andb $63, %dl
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negb %cl
+; X64-SLOW-NEXT: shlq %cl, %rdi
+; X64-SLOW-NEXT: shrq %rax
+; X64-SLOW-NEXT: andb $63, %cl
+; X64-SLOW-NEXT: xorb $63, %cl
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-SLOW-NEXT: shrq %cl, %rax
-; X64-SLOW-NEXT: orq %rsi, %rax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmoveq %rdi, %rax
+; X64-SLOW-NEXT: orq %rdi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
ret i64 %tmp
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 06fe8dfc2413..6e18c13fecb1 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -65,27 +65,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i16:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: pushl %edi
-; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
-; X86-SLOW-NEXT: andb $15, %dl
-; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, %edi
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: movb $16, %cl
-; X86-SLOW-NEXT: subb %dl, %cl
-; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: testb %dl, %dl
-; X86-SLOW-NEXT: je .LBB1_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %edi, %esi
-; X86-SLOW-NEXT: movl %esi, %eax
-; X86-SLOW-NEXT: .LBB1_2:
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SLOW-NEXT: andb $15, %cl
+; X86-SLOW-NEXT: shrl %cl, %edx
+; X86-SLOW-NEXT: addl %eax, %eax
+; X86-SLOW-NEXT: xorb $15, %cl
+; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-SLOW-NEXT: popl %esi
-; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i16:
@@ -100,16 +89,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movzwl %si, %eax
-; X64-SLOW-NEXT: andb $15, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shrl %cl, %eax
-; X64-SLOW-NEXT: movb $16, %cl
-; X64-SLOW-NEXT: subb %dl, %cl
-; X64-SLOW-NEXT: shll %cl, %edi
-; X64-SLOW-NEXT: orl %edi, %eax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmovel %esi, %eax
+; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-SLOW-NEXT: movzwl %si, %edx
+; X64-SLOW-NEXT: andb $15, %cl
+; X64-SLOW-NEXT: shrl %cl, %edx
+; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax
+; X64-SLOW-NEXT: xorb $15, %cl
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SLOW-NEXT: shll %cl, %eax
+; X64-SLOW-NEXT: orl %edx, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
%tmp = tail call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
@@ -127,26 +116,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i32:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: pushl %edi
-; X86-SLOW-NEXT: pushl %esi
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, %edi
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: andb $31, %dl
-; X86-SLOW-NEXT: movl %edx, %ecx
-; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: testb %dl, %dl
-; X86-SLOW-NEXT: je .LBB2_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %edi, %esi
-; X86-SLOW-NEXT: movl %esi, %eax
-; X86-SLOW-NEXT: .LBB2_2:
-; X86-SLOW-NEXT: popl %esi
-; X86-SLOW-NEXT: popl %edi
+; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: shrl %cl, %edx
+; X86-SLOW-NEXT: addl %eax, %eax
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
+; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i32:
@@ -159,17 +137,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movl %edi, %eax
-; X64-SLOW-NEXT: movl %esi, %edi
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shrl %cl, %edi
-; X64-SLOW-NEXT: andb $31, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negb %cl
+; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-SLOW-NEXT: shrl %cl, %esi
+; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax
+; X64-SLOW-NEXT: andb $31, %cl
+; X64-SLOW-NEXT: xorb $31, %cl
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: shll %cl, %eax
-; X64-SLOW-NEXT: orl %edi, %eax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmovel %esi, %eax
+; X64-SLOW-NEXT: orl %esi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %tmp
@@ -276,76 +252,61 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: subl $8, %esp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: andl $63, %ebx
-; X86-SLOW-NEXT: movb $64, %al
-; X86-SLOW-NEXT: subb %bl, %al
-; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: movl %eax, %ecx
-; X86-SLOW-NEXT: shll %cl, %edx
-; X86-SLOW-NEXT: movb %al, %ch
-; X86-SLOW-NEXT: andb $31, %ch
+; X86-SLOW-NEXT: movb $64, %ch
+; X86-SLOW-NEXT: subb %bl, %ch
; X86-SLOW-NEXT: movb %ch, %cl
-; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: movl %esi, %edi
-; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: testb %ch, %ch
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT: je .LBB5_2
-; X86-SLOW-NEXT: # %bb.1:
-; X86-SLOW-NEXT: orl %edi, %edx
-; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: .LBB5_2:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: shrl %cl, %edx
-; X86-SLOW-NEXT: movb %bl, %ah
-; X86-SLOW-NEXT: andb $31, %ah
-; X86-SLOW-NEXT: movb %ah, %cl
-; X86-SLOW-NEXT: negb %cl
-; X86-SLOW-NEXT: movl %ebp, %edi
-; X86-SLOW-NEXT: shll %cl, %edi
-; X86-SLOW-NEXT: testb %ah, %ah
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT: je .LBB5_4
-; X86-SLOW-NEXT: # %bb.3:
-; X86-SLOW-NEXT: orl %edx, %edi
-; X86-SLOW-NEXT: movl %edi, %ebp
-; X86-SLOW-NEXT: .LBB5_4:
+; X86-SLOW-NEXT: shll %cl, %edx
+; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %esi, %edx
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
+; X86-SLOW-NEXT: shrl %esi
+; X86-SLOW-NEXT: shrl %cl, %esi
+; X86-SLOW-NEXT: movb %bl, %cl
+; X86-SLOW-NEXT: shrl %cl, %eax
+; X86-SLOW-NEXT: andb $31, %cl
+; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SLOW-NEXT: movl %ebx, %ecx
+; X86-SLOW-NEXT: leal (%edi,%edi), %ebp
+; X86-SLOW-NEXT: shll %cl, %ebp
+; X86-SLOW-NEXT: movb %bl, %cl
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb $32, %bl
-; X86-SLOW-NEXT: je .LBB5_6
-; X86-SLOW-NEXT: # %bb.5:
+; X86-SLOW-NEXT: jne .LBB5_1
+; X86-SLOW-NEXT: # %bb.2:
+; X86-SLOW-NEXT: orl %eax, %ebp
+; X86-SLOW-NEXT: jmp .LBB5_3
+; X86-SLOW-NEXT: .LBB5_1:
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: xorl %edi, %edi
+; X86-SLOW-NEXT: .LBB5_3:
+; X86-SLOW-NEXT: movb %ch, %cl
+; X86-SLOW-NEXT: shll %cl, %edx
+; X86-SLOW-NEXT: testb $32, %ch
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: jne .LBB5_4
+; X86-SLOW-NEXT: # %bb.5:
+; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SLOW-NEXT: orl %esi, %ecx
+; X86-SLOW-NEXT: jmp .LBB5_6
+; X86-SLOW-NEXT: .LBB5_4:
+; X86-SLOW-NEXT: movl %edx, %ecx
+; X86-SLOW-NEXT: movl $0, (%esp) # 4-byte Folded Spill
; X86-SLOW-NEXT: .LBB5_6:
-; X86-SLOW-NEXT: movl %eax, %ecx
-; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: testb $32, %al
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: jne .LBB5_7
-; X86-SLOW-NEXT: # %bb.8:
-; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-SLOW-NEXT: testl %ebx, %ebx
-; X86-SLOW-NEXT: jne .LBB5_10
-; X86-SLOW-NEXT: jmp .LBB5_11
-; X86-SLOW-NEXT: .LBB5_7:
-; X86-SLOW-NEXT: movl %esi, %eax
-; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: testl %ebx, %ebx
-; X86-SLOW-NEXT: je .LBB5_11
-; X86-SLOW-NEXT: .LBB5_10:
-; X86-SLOW-NEXT: orl %ebp, %esi
-; X86-SLOW-NEXT: orl %edi, %eax
-; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %eax, %edx
-; X86-SLOW-NEXT: .LBB5_11:
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SLOW-NEXT: je .LBB5_8
+; X86-SLOW-NEXT: # %bb.7:
+; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-SLOW-NEXT: orl %ebp, %eax
+; X86-SLOW-NEXT: orl %edi, %ecx
+; X86-SLOW-NEXT: movl %ecx, %edx
+; X86-SLOW-NEXT: .LBB5_8:
; X86-SLOW-NEXT: addl $8, %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
@@ -363,17 +324,14 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movq %rdi, %rax
-; X64-SLOW-NEXT: movq %rsi, %rdi
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: shrq %cl, %rdi
-; X64-SLOW-NEXT: andb $63, %dl
-; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: negb %cl
+; X64-SLOW-NEXT: movq %rdx, %rcx
+; X64-SLOW-NEXT: shrq %cl, %rsi
+; X64-SLOW-NEXT: leaq (%rdi,%rdi), %rax
+; X64-SLOW-NEXT: andb $63, %cl
+; X64-SLOW-NEXT: xorb $63, %cl
+; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-SLOW-NEXT: shlq %cl, %rax
-; X64-SLOW-NEXT: orq %rdi, %rax
-; X64-SLOW-NEXT: testb %dl, %dl
-; X64-SLOW-NEXT: cmoveq %rsi, %rax
+; X64-SLOW-NEXT: orq %rsi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z)
ret i64 %tmp