[llvm] cc65a7a - [X86] Improve i8 + 'slow' i16 funnel shift codegen
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun May 24 00:09:21 PDT 2020
Author: Simon Pilgrim
Date: 2020-05-24T08:08:53+01:00
New Revision: cc65a7a5ea81f8cb5068e99d9bf407745623c624
URL: https://github.com/llvm/llvm-project/commit/cc65a7a5ea81f8cb5068e99d9bf407745623c624
DIFF: https://github.com/llvm/llvm-project/commit/cc65a7a5ea81f8cb5068e99d9bf407745623c624.diff
LOG: [X86] Improve i8 + 'slow' i16 funnel shift codegen
This is a preliminary patch before I deal with the xor+and issue raised in D77301.
We get much better code for i8/i16 funnel shifts by concatenating the operands together and performing the shift as a double-width type; this avoids repeated use of the shift amount and partial registers.
fshl(x,y,z) -> (((zext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
fshr(x,y,z) -> ((zext(x) << bw) | zext(y)) >> (z & (bw-1)).
Alive2: http://volta.cs.utah.edu:8080/z/CZx7Cn
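As a sanity check separate from the Alive2 proof, here's a minimal standalone C++ sketch (illustrative only, not part of the patch) that brute-forces every i8 input and compares the concatenate-and-shift rewrite against reference fshl/fshr semantics:

// Standalone brute-force check of the i8 rewrite (illustrative only):
// compares the concatenate-and-shift expressions above against plain
// fshl/fshr reference semantics for every (x, y, z).
#include <cassert>
#include <cstdint>

static uint8_t RefFshl(uint8_t X, uint8_t Y, uint8_t Z) {
  unsigned S = Z & 7;
  return S ? uint8_t((X << S) | (Y >> (8 - S))) : X;
}

static uint8_t RefFshr(uint8_t X, uint8_t Y, uint8_t Z) {
  unsigned S = Z & 7;
  return S ? uint8_t((X << (8 - S)) | (Y >> S)) : Y;
}

int main() {
  for (unsigned X = 0; X != 256; ++X)
    for (unsigned Y = 0; Y != 256; ++Y)
      for (unsigned Z = 0; Z != 256; ++Z) {
        uint32_t Concat = (X << 8) | Y;   // (zext(x) << bw) | zext(y)
        unsigned Amt = Z & 7;             // z & (bw-1)
        uint8_t Fshl = uint8_t((Concat << Amt) >> 8);
        uint8_t Fshr = uint8_t(Concat >> Amt);
        assert(Fshl == RefFshl(X, Y, Z));
        assert(Fshr == RefFshr(X, Y, Z));
      }
  return 0;
}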
This doesn't do as well for i32 cases on x86_64 (the xor+and follow-up patch is much better), so I haven't bothered with that.
Cases with constant amounts are more dubious as well, so I haven't currently bothered with those - it's these kinds of 'edge' cases that put me off trying to put this in TargetLowering::expandFunnelShift.
Differential Revision: https://reviews.llvm.org/D80466
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/fshl.ll
llvm/test/CodeGen/X86/fshr.ll
llvm/test/CodeGen/X86/rotate-extract.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 94e2e106abc7..4d51faecec7f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -201,6 +201,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// For slow shld targets we only lower for code size.
LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
+ setOperationAction(ShiftOp , MVT::i8 , Custom);
setOperationAction(ShiftOp , MVT::i16 , Custom);
setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
if (Subtarget.is64Bit())
@@ -19074,13 +19075,36 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
Op0, Op1, Amt);
}
-
- assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
- "Unexpected funnel shift type!");
+ assert(
+ (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
+ "Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
- if (!OptForSize && Subtarget.isSHLDSlow())
+ bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
+
+ // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
+ // fshr(x,y,z) -> ((aext(x) << bw) | zext(y)) >> (z & (bw-1)).
+ if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
+ !isa<ConstantSDNode>(Amt)) {
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
+ SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
+ Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
+ Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
+ Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
+ SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
+ if (IsFSHR) {
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
+ } else {
+ Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
+ }
+ return DAG.getZExtOrTrunc(Res, DL, VT);
+ }
+
+ if (VT == MVT::i8 || ExpandFunnel)
return SDValue();
// i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 99f5a3e923bb..ce9709136bd8 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -16,37 +16,26 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) nounwind readnone
define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
; X86-LABEL: var_shift_i8:
; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT: movb {{[0-9]+}}(%esp), %al
-; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT: andb $7, %dl
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: movb %dl, %cl
-; X86-NEXT: shlb %cl, %ch
-; X86-NEXT: movb $8, %cl
-; X86-NEXT: subb %dl, %cl
-; X86-NEXT: shrb %cl, %ah
-; X86-NEXT: testb %dl, %dl
-; X86-NEXT: je .LBB0_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: orb %ah, %ch
-; X86-NEXT: movb %ch, %al
-; X86-NEXT: .LBB0_2:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: andb $7, %cl
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movb %ah, %al
; X86-NEXT: retl
;
; X64-LABEL: var_shift_i8:
; X64: # %bb.0:
-; X64-NEXT: andb $7, %dl
-; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %edx, %ecx
-; X64-NEXT: shlb %cl, %al
-; X64-NEXT: movb $8, %cl
-; X64-NEXT: subb %dl, %cl
-; X64-NEXT: shrb %cl, %sil
-; X64-NEXT: orb %al, %sil
+; X64-NEXT: shll $8, %edi
; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: testb %dl, %dl
-; X64-NEXT: cmovel %edi, %eax
+; X64-NEXT: orl %edi, %eax
+; X64-NEXT: andb $7, %cl
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shll %cl, %eax
+; X64-NEXT: shrl $8, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%tmp = tail call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 %z)
@@ -65,15 +54,14 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i16:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-SLOW-NEXT: andb $15, %cl
-; X86-SLOW-NEXT: shll %cl, %edx
-; X86-SLOW-NEXT: shrl %eax
-; X86-SLOW-NEXT: xorb $15, %cl
-; X86-SLOW-NEXT: shrl %cl, %eax
+; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: shll $16, %eax
; X86-SLOW-NEXT: orl %edx, %eax
+; X86-SLOW-NEXT: andb $15, %cl
+; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: shrl $16, %eax
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X86-SLOW-NEXT: retl
;
@@ -90,14 +78,13 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %edx, %ecx
+; X64-SLOW-NEXT: shll $16, %edi
; X64-SLOW-NEXT: movzwl %si, %eax
+; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: andb $15, %cl
-; X64-SLOW-NEXT: shll %cl, %edi
-; X64-SLOW-NEXT: xorb $15, %cl
-; X64-SLOW-NEXT: shrl %eax
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-SLOW-NEXT: shrl %cl, %eax
-; X64-SLOW-NEXT: orl %edi, %eax
+; X64-SLOW-NEXT: shll %cl, %eax
+; X64-SLOW-NEXT: shrl $16, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
%tmp = tail call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 6e18c13fecb1..7f9d10f2fd2f 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -16,37 +16,25 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) nounwind readnone
define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
; X86-LABEL: var_shift_i8:
; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT: movb {{[0-9]+}}(%esp), %al
-; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT: andb $7, %dl
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: movb %dl, %cl
-; X86-NEXT: shrb %cl, %ch
-; X86-NEXT: movb $8, %cl
-; X86-NEXT: subb %dl, %cl
-; X86-NEXT: shlb %cl, %ah
-; X86-NEXT: testb %dl, %dl
-; X86-NEXT: je .LBB0_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: orb %ch, %ah
-; X86-NEXT: movb %ah, %al
-; X86-NEXT: .LBB0_2:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: andb $7, %cl
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: var_shift_i8:
; X64: # %bb.0:
-; X64-NEXT: andb $7, %dl
-; X64-NEXT: movl %esi, %eax
; X64-NEXT: movl %edx, %ecx
-; X64-NEXT: shrb %cl, %al
-; X64-NEXT: movb $8, %cl
-; X64-NEXT: subb %dl, %cl
-; X64-NEXT: shlb %cl, %dil
-; X64-NEXT: orb %al, %dil
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: testb %dl, %dl
-; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: shll $8, %edi
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: orl %edi, %eax
+; X64-NEXT: andb $7, %cl
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shrl %cl, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%tmp = tail call i8 @llvm.fshr.i8(i8 %x, i8 %y, i8 %z)
@@ -65,15 +53,13 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i16:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-SLOW-NEXT: andb $15, %cl
-; X86-SLOW-NEXT: shrl %cl, %edx
-; X86-SLOW-NEXT: addl %eax, %eax
-; X86-SLOW-NEXT: xorb $15, %cl
-; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: shll $16, %eax
; X86-SLOW-NEXT: orl %edx, %eax
+; X86-SLOW-NEXT: andb $15, %cl
+; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X86-SLOW-NEXT: retl
;
@@ -90,15 +76,12 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-SLOW-NEXT: movzwl %si, %edx
+; X64-SLOW-NEXT: shll $16, %edi
+; X64-SLOW-NEXT: movzwl %si, %eax
+; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: andb $15, %cl
-; X64-SLOW-NEXT: shrl %cl, %edx
-; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax
-; X64-SLOW-NEXT: xorb $15, %cl
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-SLOW-NEXT: shll %cl, %eax
-; X64-SLOW-NEXT: orl %edx, %eax
+; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
%tmp = tail call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll
index a705773598b4..9ef29c7883d4 100644
--- a/llvm/test/CodeGen/X86/rotate-extract.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract.ll
@@ -232,31 +232,31 @@ define i8 @no_extract_udiv(i8 %i) nounwind {
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull $171, %eax, %ecx
-; X86-NEXT: shlb $3, %ch
-; X86-NEXT: andb $-16, %ch
; X86-NEXT: imull $79, %eax, %edx
; X86-NEXT: subb %dh, %al
; X86-NEXT: shrb %al
; X86-NEXT: addb %dh, %al
; X86-NEXT: shrb $5, %al
-; X86-NEXT: orb %ch, %al
-; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: shlb $3, %ch
+; X86-NEXT: orb %al, %ch
+; X86-NEXT: andb $-9, %ch
+; X86-NEXT: movb %ch, %al
; X86-NEXT: retl
;
; X64-LABEL: no_extract_udiv:
; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $171, %eax, %ecx
-; X64-NEXT: shrl $8, %ecx
-; X64-NEXT: shlb $3, %cl
-; X64-NEXT: andb $-16, %cl
-; X64-NEXT: imull $79, %eax, %edx
+; X64-NEXT: movzbl %dil, %ecx
+; X64-NEXT: imull $171, %ecx, %eax
+; X64-NEXT: shrl $8, %eax
+; X64-NEXT: imull $79, %ecx, %edx
; X64-NEXT: shrl $8, %edx
-; X64-NEXT: subb %dl, %al
-; X64-NEXT: shrb %al
-; X64-NEXT: addb %dl, %al
-; X64-NEXT: shrb $5, %al
+; X64-NEXT: subb %dl, %cl
+; X64-NEXT: shrb %cl
+; X64-NEXT: addb %dl, %cl
+; X64-NEXT: shrb $5, %cl
+; X64-NEXT: shlb $3, %al
; X64-NEXT: orb %cl, %al
+; X64-NEXT: andb $-9, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%lhs_div = udiv i8 %i, 3