[llvm] cc65a7a - [X86] Improve i8 + 'slow' i16 funnel shift codegen
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun May 24 00:09:21 PDT 2020
Author: Simon Pilgrim
Date: 2020-05-24T08:08:53+01:00
New Revision: cc65a7a5ea81f8cb5068e99d9bf407745623c624
URL: https://github.com/llvm/llvm-project/commit/cc65a7a5ea81f8cb5068e99d9bf407745623c624
DIFF: https://github.com/llvm/llvm-project/commit/cc65a7a5ea81f8cb5068e99d9bf407745623c624.diff
LOG: [X86] Improve i8 + 'slow' i16 funnel shift codegen
This is a preliminary patch before I deal with the xor+and issue raised in D77301.
We get much better code for i8/i16 funnel shifts by concatenating the operands together and performing the shift as a double-width type; this avoids repeated use of the shift amount and partial registers.
fshl(x,y,z) -> (((zext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
fshr(x,y,z) -> ((zext(x) << bw) | zext(y)) >> (z & (bw-1)).
Alive2: http://volta.cs.utah.edu:8080/z/CZx7Cn
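As a sanity check separate from the Alive2 proof, here's a minimal standalone C++ sketch (illustrative only, not part of the patch) that brute-forces every i8 input and compares the concatenate-and-shift rewrite against reference fshl/fshr semantics:

// Standalone brute-force check of the i8 rewrite (illustrative only):
// compares the concatenate-and-shift expressions above against plain
// fshl/fshr reference semantics for every (x, y, z).
#include <cassert>
#include <cstdint>

static uint8_t RefFshl(uint8_t X, uint8_t Y, uint8_t Z) {
  unsigned S = Z & 7;
  return S ? uint8_t((X << S) | (Y >> (8 - S))) : X;
}

static uint8_t RefFshr(uint8_t X, uint8_t Y, uint8_t Z) {
  unsigned S = Z & 7;
  return S ? uint8_t((X << (8 - S)) | (Y >> S)) : Y;
}

int main() {
  for (unsigned X = 0; X != 256; ++X)
    for (unsigned Y = 0; Y != 256; ++Y)
      for (unsigned Z = 0; Z != 256; ++Z) {
        uint32_t Concat = (X << 8) | Y;   // (zext(x) << bw) | zext(y)
        unsigned Amt = Z & 7;             // z & (bw-1)
        uint8_t Fshl = uint8_t((Concat << Amt) >> 8);
        uint8_t Fshr = uint8_t(Concat >> Amt);
        assert(Fshl == RefFshl(X, Y, Z));
        assert(Fshr == RefFshr(X, Y, Z));
      }
  return 0;
}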
This doesn't do as well for i32 cases on x86_64 (the xor+and follow-up patch is much better), so I haven't bothered with that.
Cases with constant amounts are more dubious as well, so I haven't currently bothered with those - it's these kinds of 'edge' cases that put me off trying to put this in TargetLowering::expandFunnelShift.
Differential Revision: https://reviews.llvm.org/D80466
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/fshl.ll
llvm/test/CodeGen/X86/fshr.ll
llvm/test/CodeGen/X86/rotate-extract.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 94e2e106abc7..4d51faecec7f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -201,6 +201,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// For slow shld targets we only lower for code size.
LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
+ setOperationAction(ShiftOp , MVT::i8 , Custom);
setOperationAction(ShiftOp , MVT::i16 , Custom);
setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
if (Subtarget.is64Bit())
@@ -19074,13 +19075,36 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
Op0, Op1, Amt);
}
-
- assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
- "Unexpected funnel shift type!");
+ assert(
+ (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
+ "Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
- if (!OptForSize && Subtarget.isSHLDSlow())
+ bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
+
+ // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
+ // fshr(x,y,z) -> ((aext(x) << bw) | zext(y)) >> (z & (bw-1)).
+ if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
+ !isa<ConstantSDNode>(Amt)) {
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
+ SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
+ Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
+ Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
+ Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
+ SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
+ if (IsFSHR) {
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
+ } else {
+ Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
+ }
+ return DAG.getZExtOrTrunc(Res, DL, VT);
+ }
+
+ if (VT == MVT::i8 || ExpandFunnel)
return SDValue();
// i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 99f5a3e923bb..ce9709136bd8 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -16,37 +16,26 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) nounwind readnone
define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
; X86-LABEL: var_shift_i8:
; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT: movb {{[0-9]+}}(%esp), %al
-; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT: andb $7, %dl
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: movb %dl, %cl
-; X86-NEXT: shlb %cl, %ch
-; X86-NEXT: movb $8, %cl
-; X86-NEXT: subb %dl, %cl
-; X86-NEXT: shrb %cl, %ah
-; X86-NEXT: testb %dl, %dl
-; X86-NEXT: je .LBB0_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: orb %ah, %ch
-; X86-NEXT: movb %ch, %al
-; X86-NEXT: .LBB0_2:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: andb $7, %cl
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movb %ah, %al
; X86-NEXT: retl
;
; X64-LABEL: var_shift_i8:
; X64: # %bb.0:
-; X64-NEXT: andb $7, %dl
-; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %edx, %ecx
-; X64-NEXT: shlb %cl, %al
-; X64-NEXT: movb $8, %cl
-; X64-NEXT: subb %dl, %cl
-; X64-NEXT: shrb %cl, %sil
-; X64-NEXT: orb %al, %sil
+; X64-NEXT: shll $8, %edi
; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: testb %dl, %dl
-; X64-NEXT: cmovel %edi, %eax
+; X64-NEXT: orl %edi, %eax
+; X64-NEXT: andb $7, %cl
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shll %cl, %eax
+; X64-NEXT: shrl $8, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%tmp = tail call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 %z)
@@ -65,15 +54,14 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i16:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-SLOW-NEXT: andb $15, %cl
-; X86-SLOW-NEXT: shll %cl, %edx
-; X86-SLOW-NEXT: shrl %eax
-; X86-SLOW-NEXT: xorb $15, %cl
-; X86-SLOW-NEXT: shrl %cl, %eax
+; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: shll $16, %eax
; X86-SLOW-NEXT: orl %edx, %eax
+; X86-SLOW-NEXT: andb $15, %cl
+; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: shrl $16, %eax
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X86-SLOW-NEXT: retl
;
@@ -90,14 +78,13 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %edx, %ecx
+; X64-SLOW-NEXT: shll $16, %edi
; X64-SLOW-NEXT: movzwl %si, %eax
+; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: andb $15, %cl
-; X64-SLOW-NEXT: shll %cl, %edi
-; X64-SLOW-NEXT: xorb $15, %cl
-; X64-SLOW-NEXT: shrl %eax
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-SLOW-NEXT: shrl %cl, %eax
-; X64-SLOW-NEXT: orl %edi, %eax
+; X64-SLOW-NEXT: shll %cl, %eax
+; X64-SLOW-NEXT: shrl $16, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
%tmp = tail call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 6e18c13fecb1..7f9d10f2fd2f 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -16,37 +16,25 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) nounwind readnone
define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
; X86-LABEL: var_shift_i8:
; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT: movb {{[0-9]+}}(%esp), %al
-; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT: andb $7, %dl
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: movb %dl, %cl
-; X86-NEXT: shrb %cl, %ch
-; X86-NEXT: movb $8, %cl
-; X86-NEXT: subb %dl, %cl
-; X86-NEXT: shlb %cl, %ah
-; X86-NEXT: testb %dl, %dl
-; X86-NEXT: je .LBB0_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: orb %ch, %ah
-; X86-NEXT: movb %ah, %al
-; X86-NEXT: .LBB0_2:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: andb $7, %cl
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: var_shift_i8:
; X64: # %bb.0:
-; X64-NEXT: andb $7, %dl
-; X64-NEXT: movl %esi, %eax
; X64-NEXT: movl %edx, %ecx
-; X64-NEXT: shrb %cl, %al
-; X64-NEXT: movb $8, %cl
-; X64-NEXT: subb %dl, %cl
-; X64-NEXT: shlb %cl, %dil
-; X64-NEXT: orb %al, %dil
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: testb %dl, %dl
-; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: shll $8, %edi
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: orl %edi, %eax
+; X64-NEXT: andb $7, %cl
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shrl %cl, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%tmp = tail call i8 @llvm.fshr.i8(i8 %x, i8 %y, i8 %z)
@@ -65,15 +53,13 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i16:
; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
-; X86-SLOW-NEXT: andb $15, %cl
-; X86-SLOW-NEXT: shrl %cl, %edx
-; X86-SLOW-NEXT: addl %eax, %eax
-; X86-SLOW-NEXT: xorb $15, %cl
-; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT: shll $16, %eax
; X86-SLOW-NEXT: orl %edx, %eax
+; X86-SLOW-NEXT: andb $15, %cl
+; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X86-SLOW-NEXT: retl
;
@@ -90,15 +76,12 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %edx, %ecx
-; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-SLOW-NEXT: movzwl %si, %edx
+; X64-SLOW-NEXT: shll $16, %edi
+; X64-SLOW-NEXT: movzwl %si, %eax
+; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: andb $15, %cl
-; X64-SLOW-NEXT: shrl %cl, %edx
-; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax
-; X64-SLOW-NEXT: xorb $15, %cl
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-SLOW-NEXT: shll %cl, %eax
-; X64-SLOW-NEXT: orl %edx, %eax
+; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
%tmp = tail call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll
index a705773598b4..9ef29c7883d4 100644
--- a/llvm/test/CodeGen/X86/rotate-extract.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract.ll
@@ -232,31 +232,31 @@ define i8 @no_extract_udiv(i8 %i) nounwind {
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull $171, %eax, %ecx
-; X86-NEXT: shlb $3, %ch
-; X86-NEXT: andb $-16, %ch
; X86-NEXT: imull $79, %eax, %edx
; X86-NEXT: subb %dh, %al
; X86-NEXT: shrb %al
; X86-NEXT: addb %dh, %al
; X86-NEXT: shrb $5, %al
-; X86-NEXT: orb %ch, %al
-; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: shlb $3, %ch
+; X86-NEXT: orb %al, %ch
+; X86-NEXT: andb $-9, %ch
+; X86-NEXT: movb %ch, %al
; X86-NEXT: retl
;
; X64-LABEL: no_extract_udiv:
; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $171, %eax, %ecx
-; X64-NEXT: shrl $8, %ecx
-; X64-NEXT: shlb $3, %cl
-; X64-NEXT: andb $-16, %cl
-; X64-NEXT: imull $79, %eax, %edx
+; X64-NEXT: movzbl %dil, %ecx
+; X64-NEXT: imull $171, %ecx, %eax
+; X64-NEXT: shrl $8, %eax
+; X64-NEXT: imull $79, %ecx, %edx
; X64-NEXT: shrl $8, %edx
-; X64-NEXT: subb %dl, %al
-; X64-NEXT: shrb %al
-; X64-NEXT: addb %dl, %al
-; X64-NEXT: shrb $5, %al
+; X64-NEXT: subb %dl, %cl
+; X64-NEXT: shrb %cl
+; X64-NEXT: addb %dl, %cl
+; X64-NEXT: shrb $5, %cl
+; X64-NEXT: shlb $3, %al
; X64-NEXT: orb %cl, %al
+; X64-NEXT: andb $-9, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%lhs_div = udiv i8 %i, 3