[llvm] c5fd9e3 - [DAG] Don't permit EXTLOAD when combining FSHL/FSHR consecutive loads (PR45265)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 21 03:53:00 PDT 2020


Author: Simon Pilgrim
Date: 2020-03-21T10:52:41Z
New Revision: c5fd9e3888d5e8f849a872949b5891f6ea2eba56

URL: https://github.com/llvm/llvm-project/commit/c5fd9e3888d5e8f849a872949b5891f6ea2eba56
DIFF: https://github.com/llvm/llvm-project/commit/c5fd9e3888d5e8f849a872949b5891f6ea2eba56.diff

LOG: [DAG] Don't permit EXTLOAD when combining FSHL/FSHR consecutive loads (PR45265)

Technically we can permit EXTLOAD of the LHS operand but only if all the extended bits are shifted out. Until we test coverage for that case, I'm just disabling this to fix PR45265.

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/X86/funnel-shift.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7a68d18ef9b5..a6537a0bf97e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8325,13 +8325,15 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
     // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
     // TODO - bigendian support once we have test coverage.
     // TODO - can we merge this with CombineConseutiveLoads/MatchLoadCombine?
+    // TODO - permit LHS EXTLOAD if extensions are shifted out.
     if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() &&
         !DAG.getDataLayout().isBigEndian()) {
       auto *LHS = dyn_cast<LoadSDNode>(N0);
       auto *RHS = dyn_cast<LoadSDNode>(N1);
       if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
           LHS->getAddressSpace() == RHS->getAddressSpace() &&
-          (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS)) {
+          (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) &&
+          ISD::isNON_EXTLoad(LHS)) {
         if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
           SDLoc DL(RHS);
           uint64_t PtrOff =

diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index 517880fb88e5..f78fe2c00eb3 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -918,3 +918,67 @@ define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) nounw
   ret <4 x i32> %f
 }
 
+%struct.S = type { [11 x i8], i8 }
+define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind {
+; X32-SSE2-LABEL: PR45265:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %edi
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE2-NEXT:    leal (%eax,%eax,2), %edx
+; X32-SSE2-NEXT:    movzwl 8(%ecx,%edx,4), %esi
+; X32-SSE2-NEXT:    movsbl 10(%ecx,%edx,4), %edi
+; X32-SSE2-NEXT:    shll $16, %edi
+; X32-SSE2-NEXT:    orl %edi, %esi
+; X32-SSE2-NEXT:    movl 4(%ecx,%edx,4), %ecx
+; X32-SSE2-NEXT:    shrdl $8, %esi, %ecx
+; X32-SSE2-NEXT:    xorl %eax, %ecx
+; X32-SSE2-NEXT:    sarl $31, %eax
+; X32-SSE2-NEXT:    sarl $31, %edi
+; X32-SSE2-NEXT:    shldl $24, %esi, %edi
+; X32-SSE2-NEXT:    xorl %eax, %edi
+; X32-SSE2-NEXT:    orl %edi, %ecx
+; X32-SSE2-NEXT:    jne .LBB44_1
+; X32-SSE2-NEXT:  # %bb.2:
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    jmp _Z3foov # TAILCALL
+; X32-SSE2-NEXT:  .LBB44_1:
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %edi
+; X32-SSE2-NEXT:    retl
+;
+; X64-AVX2-LABEL: PR45265:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    movslq %edi, %rax
+; X64-AVX2-NEXT:    leaq (%rax,%rax,2), %rcx
+; X64-AVX2-NEXT:    movsbq 10(%rsi,%rcx,4), %rdx
+; X64-AVX2-NEXT:    shlq $16, %rdx
+; X64-AVX2-NEXT:    movzwl 8(%rsi,%rcx,4), %edi
+; X64-AVX2-NEXT:    orq %rdx, %rdi
+; X64-AVX2-NEXT:    movq (%rsi,%rcx,4), %rcx
+; X64-AVX2-NEXT:    shrdq $40, %rdi, %rcx
+; X64-AVX2-NEXT:    cmpq %rax, %rcx
+; X64-AVX2-NEXT:    jne .LBB44_1
+; X64-AVX2-NEXT:  # %bb.2:
+; X64-AVX2-NEXT:    jmp _Z3foov # TAILCALL
+; X64-AVX2-NEXT:  .LBB44_1:
+; X64-AVX2-NEXT:    retq
+  %3 = sext i32 %0 to i64
+  %4 = getelementptr inbounds %struct.S, %struct.S* %1, i64 %3
+  %5 = bitcast %struct.S* %4 to i88*
+  %6 = load i88, i88* %5, align 1
+  %7 = ashr i88 %6, 40
+  %8 = trunc i88 %7 to i64
+  %9 = icmp eq i64 %8, %3
+  br i1 %9, label %10, label %11
+
+10:
+  tail call void @_Z3foov()
+  br label %11
+
+11:
+  ret void
+}
+declare dso_local void @_Z3foov()


        


More information about the llvm-commits mailing list