[llvm] 7202d9c - [DAG] Combine fshl/fshr(load1,load0,c) if we have consecutive loads

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 6 03:38:40 PST 2020


Author: Simon Pilgrim
Date: 2020-03-06T11:36:18Z
New Revision: 7202d9cde9d23ba94c7f09fd6b5eafc437136542

URL: https://github.com/llvm/llvm-project/commit/7202d9cde9d23ba94c7f09fd6b5eafc437136542
DIFF: https://github.com/llvm/llvm-project/commit/7202d9cde9d23ba94c7f09fd6b5eafc437136542.diff

LOG: [DAG] Combine fshl/fshr(load1,load0,c) if we have consecutive loads

As noted on D75114, if both arguments of a funnel shift are consecutive loads, we are missing the opportunity to combine them into a single load.

Differential Revision: https://reviews.llvm.org/D75624
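
For reference, a sketch of the i32 pattern this fold targets, reconstructed from the fshl.ll test updated below (the shift amount of 8 is inferred from the removed "shldl $8" and the new "movl 11(%rdi)" CHECK lines, so treat the call site as an assumption rather than a verbatim copy of the test):

    ; Two consecutive i32 loads feeding a funnel shift by a byte multiple.
    declare i32 @llvm.fshl.i32(i32, i32, i32)

    define i32 @combine_fshl_load_i32(i32* %p) nounwind {
      %p0 = getelementptr i32, i32* %p, i32 2   ; byte offset 8
      %p1 = getelementptr i32, i32* %p, i32 3   ; byte offset 12, consecutive with %p0
      %ld0 = load i32, i32* %p0
      %ld1 = load i32, i32* %p1
      %res = call i32 @llvm.fshl.i32(i32 %ld1, i32 %ld0, i32 8)
      ret i32 %res
    }

On little-endian targets this fshl of two consecutive loads is equivalent to a single i32 load at byte offset 8 + (32 - 8)/8 = 11, which matches the "movl 11(%rdi)" now produced for x86-64.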

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/X86/fshl.ll
    llvm/test/CodeGen/X86/fshr.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index bca7b369fef0..ef3782dd42b5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8257,6 +8257,43 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
       return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
                          DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
                                          SDLoc(N), ShAmtTy));
+
+    // fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
+    // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
+    // TODO - big-endian support once we have test coverage.
+    // TODO - can we merge this with CombineConsecutiveLoads/MatchLoadCombine?
+    if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() &&
+        !DAG.getDataLayout().isBigEndian()) {
+      auto *LHS = dyn_cast<LoadSDNode>(N0);
+      auto *RHS = dyn_cast<LoadSDNode>(N1);
+      if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
+          LHS->getAddressSpace() == RHS->getAddressSpace() &&
+          (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS)) {
+        if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
+          SDLoc DL(RHS);
+          uint64_t PtrOff =
+              IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8);
+          unsigned NewAlign = MinAlign(RHS->getAlignment(), PtrOff);
+          bool Fast = false;
+          if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+                                     RHS->getAddressSpace(), NewAlign,
+                                     RHS->getMemOperand()->getFlags(), &Fast) &&
+              Fast) {
+            SDValue NewPtr =
+                DAG.getMemBasePlusOffset(RHS->getBasePtr(), PtrOff, DL);
+            AddToWorklist(NewPtr.getNode());
+            SDValue Load = DAG.getLoad(
+                VT, DL, RHS->getChain(), NewPtr,
+                RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+                RHS->getMemOperand()->getFlags(), RHS->getAAInfo());
+            // Replace the old load's chain with the new load's chain.
+            WorklistRemover DeadNodes(*this);
+            DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1));
+            return Load;
+          }
+        }
+      }
+    }
   }
 
   // fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2)

diff  --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 6d8ccef45d20..465dea578267 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -547,39 +547,16 @@ define i8 @combine_fshl_load_i8(i8* %p) nounwind {
 }
 
 define i16 @combine_fshl_load_i16(i16* %p) nounwind {
-; X86-FAST-LABEL: combine_fshl_load_i16:
-; X86-FAST:       # %bb.0:
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT:    movzwl (%eax), %ecx
-; X86-FAST-NEXT:    movzwl 2(%eax), %eax
-; X86-FAST-NEXT:    shldw $8, %cx, %ax
-; X86-FAST-NEXT:    retl
-;
-; X86-SLOW-LABEL: combine_fshl_load_i16:
-; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT:    movzwl 2(%ecx), %eax
-; X86-SLOW-NEXT:    movzbl 1(%ecx), %ecx
-; X86-SLOW-NEXT:    shll $8, %eax
-; X86-SLOW-NEXT:    orl %ecx, %eax
-; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-SLOW-NEXT:    retl
-;
-; X64-FAST-LABEL: combine_fshl_load_i16:
-; X64-FAST:       # %bb.0:
-; X64-FAST-NEXT:    movzwl (%rdi), %ecx
-; X64-FAST-NEXT:    movzwl 2(%rdi), %eax
-; X64-FAST-NEXT:    shldw $8, %cx, %ax
-; X64-FAST-NEXT:    retq
+; X86-LABEL: combine_fshl_load_i16:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl 1(%eax), %eax
+; X86-NEXT:    retl
 ;
-; X64-SLOW-LABEL: combine_fshl_load_i16:
-; X64-SLOW:       # %bb.0:
-; X64-SLOW-NEXT:    movzwl 2(%rdi), %eax
-; X64-SLOW-NEXT:    movzbl 1(%rdi), %ecx
-; X64-SLOW-NEXT:    shll $8, %eax
-; X64-SLOW-NEXT:    orl %ecx, %eax
-; X64-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-SLOW-NEXT:    retq
+; X64-LABEL: combine_fshl_load_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl 1(%rdi), %eax
+; X64-NEXT:    retq
   %p0 = getelementptr i16, i16* %p, i32 0
   %p1 = getelementptr i16, i16* %p, i32 1
   %ld0 = load i16, i16 *%p0
@@ -589,31 +566,16 @@ define i16 @combine_fshl_load_i16(i16* %p) nounwind {
 }
 
 define i32 @combine_fshl_load_i32(i32* %p) nounwind {
-; X86-FAST-LABEL: combine_fshl_load_i32:
-; X86-FAST:       # %bb.0:
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT:    movl 8(%eax), %ecx
-; X86-FAST-NEXT:    movl 12(%eax), %eax
-; X86-FAST-NEXT:    shldl $8, %ecx, %eax
-; X86-FAST-NEXT:    retl
-;
-; X86-SLOW-LABEL: combine_fshl_load_i32:
-; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT:    movl 11(%eax), %eax
-; X86-SLOW-NEXT:    retl
-;
-; X64-FAST-LABEL: combine_fshl_load_i32:
-; X64-FAST:       # %bb.0:
-; X64-FAST-NEXT:    movl 8(%rdi), %ecx
-; X64-FAST-NEXT:    movl 12(%rdi), %eax
-; X64-FAST-NEXT:    shldl $8, %ecx, %eax
-; X64-FAST-NEXT:    retq
+; X86-LABEL: combine_fshl_load_i32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 11(%eax), %eax
+; X86-NEXT:    retl
 ;
-; X64-SLOW-LABEL: combine_fshl_load_i32:
-; X64-SLOW:       # %bb.0:
-; X64-SLOW-NEXT:    movl 11(%rdi), %eax
-; X64-SLOW-NEXT:    retq
+; X64-LABEL: combine_fshl_load_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl 11(%rdi), %eax
+; X64-NEXT:    retq
   %p0 = getelementptr i32, i32* %p, i32 2
   %p1 = getelementptr i32, i32* %p, i32 3
   %ld0 = load i32, i32 *%p0
@@ -652,21 +614,10 @@ define i64 @combine_fshl_load_i64(i64* %p) nounwind {
 ; X86-SLOW-NEXT:    popl %esi
 ; X86-SLOW-NEXT:    retl
 ;
-; X64-FAST-LABEL: combine_fshl_load_i64:
-; X64-FAST:       # %bb.0:
-; X64-FAST-NEXT:    movq 8(%rdi), %rcx
-; X64-FAST-NEXT:    movq 16(%rdi), %rax
-; X64-FAST-NEXT:    shldq $24, %rcx, %rax
-; X64-FAST-NEXT:    retq
-;
-; X64-SLOW-LABEL: combine_fshl_load_i64:
-; X64-SLOW:       # %bb.0:
-; X64-SLOW-NEXT:    movq 8(%rdi), %rcx
-; X64-SLOW-NEXT:    movq 16(%rdi), %rax
-; X64-SLOW-NEXT:    shrq $40, %rcx
-; X64-SLOW-NEXT:    shlq $24, %rax
-; X64-SLOW-NEXT:    orq %rcx, %rax
-; X64-SLOW-NEXT:    retq
+; X64-LABEL: combine_fshl_load_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    movq 13(%rdi), %rax
+; X64-NEXT:    retq
   %p0 = getelementptr i64, i64* %p, i64 1
   %p1 = getelementptr i64, i64* %p, i64 2
   %ld0 = load i64, i64 *%p0

diff  --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index f977576ce73c..644e33fe198c 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -542,39 +542,16 @@ define i8 @combine_fshr_load_i8(i8* %p) nounwind {
 }
 
 define i16 @combine_fshr_load_i16(i16* %p) nounwind {
-; X86-FAST-LABEL: combine_fshr_load_i16:
-; X86-FAST:       # %bb.0:
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT:    movzwl (%eax), %ecx
-; X86-FAST-NEXT:    movzwl 2(%eax), %eax
-; X86-FAST-NEXT:    shldw $8, %cx, %ax
-; X86-FAST-NEXT:    retl
-;
-; X86-SLOW-LABEL: combine_fshr_load_i16:
-; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT:    movzwl 2(%ecx), %eax
-; X86-SLOW-NEXT:    movzbl 1(%ecx), %ecx
-; X86-SLOW-NEXT:    shll $8, %eax
-; X86-SLOW-NEXT:    orl %ecx, %eax
-; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-SLOW-NEXT:    retl
-;
-; X64-FAST-LABEL: combine_fshr_load_i16:
-; X64-FAST:       # %bb.0:
-; X64-FAST-NEXT:    movzwl (%rdi), %ecx
-; X64-FAST-NEXT:    movzwl 2(%rdi), %eax
-; X64-FAST-NEXT:    shldw $8, %cx, %ax
-; X64-FAST-NEXT:    retq
+; X86-LABEL: combine_fshr_load_i16:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl 1(%eax), %eax
+; X86-NEXT:    retl
 ;
-; X64-SLOW-LABEL: combine_fshr_load_i16:
-; X64-SLOW:       # %bb.0:
-; X64-SLOW-NEXT:    movzwl 2(%rdi), %eax
-; X64-SLOW-NEXT:    movzbl 1(%rdi), %ecx
-; X64-SLOW-NEXT:    shll $8, %eax
-; X64-SLOW-NEXT:    orl %ecx, %eax
-; X64-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-SLOW-NEXT:    retq
+; X64-LABEL: combine_fshr_load_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl 1(%rdi), %eax
+; X64-NEXT:    retq
   %p0 = getelementptr i16, i16* %p, i32 0
   %p1 = getelementptr i16, i16* %p, i32 1
   %ld0 = load i16, i16 *%p0
@@ -584,39 +561,16 @@ define i16 @combine_fshr_load_i16(i16* %p) nounwind {
 }
 
 define i32 @combine_fshr_load_i32(i32* %p) nounwind {
-; X86-FAST-LABEL: combine_fshr_load_i32:
-; X86-FAST:       # %bb.0:
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT:    movl 8(%eax), %ecx
-; X86-FAST-NEXT:    movl 12(%eax), %eax
-; X86-FAST-NEXT:    shldl $24, %ecx, %eax
-; X86-FAST-NEXT:    retl
-;
-; X86-SLOW-LABEL: combine_fshr_load_i32:
-; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT:    movl 8(%eax), %ecx
-; X86-SLOW-NEXT:    movl 12(%eax), %eax
-; X86-SLOW-NEXT:    shrl $8, %ecx
-; X86-SLOW-NEXT:    shll $24, %eax
-; X86-SLOW-NEXT:    orl %ecx, %eax
-; X86-SLOW-NEXT:    retl
-;
-; X64-FAST-LABEL: combine_fshr_load_i32:
-; X64-FAST:       # %bb.0:
-; X64-FAST-NEXT:    movl 8(%rdi), %ecx
-; X64-FAST-NEXT:    movl 12(%rdi), %eax
-; X64-FAST-NEXT:    shldl $24, %ecx, %eax
-; X64-FAST-NEXT:    retq
+; X86-LABEL: combine_fshr_load_i32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 9(%eax), %eax
+; X86-NEXT:    retl
 ;
-; X64-SLOW-LABEL: combine_fshr_load_i32:
-; X64-SLOW:       # %bb.0:
-; X64-SLOW-NEXT:    movl 8(%rdi), %ecx
-; X64-SLOW-NEXT:    movl 12(%rdi), %eax
-; X64-SLOW-NEXT:    shrl $8, %ecx
-; X64-SLOW-NEXT:    shll $24, %eax
-; X64-SLOW-NEXT:    orl %ecx, %eax
-; X64-SLOW-NEXT:    retq
+; X64-LABEL: combine_fshr_load_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl 9(%rdi), %eax
+; X64-NEXT:    retq
   %p0 = getelementptr i32, i32* %p, i32 2
   %p1 = getelementptr i32, i32* %p, i32 3
   %ld0 = load i32, i32 *%p0
@@ -656,21 +610,10 @@ define i64 @combine_fshr_load_i64(i64* %p) nounwind {
 ; X86-SLOW-NEXT:    popl %esi
 ; X86-SLOW-NEXT:    retl
 ;
-; X64-FAST-LABEL: combine_fshr_load_i64:
-; X64-FAST:       # %bb.0:
-; X64-FAST-NEXT:    movq 8(%rdi), %rcx
-; X64-FAST-NEXT:    movq 16(%rdi), %rax
-; X64-FAST-NEXT:    shldq $40, %rcx, %rax
-; X64-FAST-NEXT:    retq
-;
-; X64-SLOW-LABEL: combine_fshr_load_i64:
-; X64-SLOW:       # %bb.0:
-; X64-SLOW-NEXT:    movq 8(%rdi), %rcx
-; X64-SLOW-NEXT:    movq 16(%rdi), %rax
-; X64-SLOW-NEXT:    shrq $24, %rcx
-; X64-SLOW-NEXT:    shlq $40, %rax
-; X64-SLOW-NEXT:    orq %rcx, %rax
-; X64-SLOW-NEXT:    retq
+; X64-LABEL: combine_fshr_load_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    movq 11(%rdi), %rax
+; X64-NEXT:    retq
   %p0 = getelementptr i64, i64* %p, i64 1
   %p1 = getelementptr i64, i64* %p, i64 2
   %ld0 = load i64, i64 *%p0


        

