[llvm] 7202d9c - [DAG] Combine fshl/fshr(load1,load0,c) if we have consecutive loads
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 6 03:38:40 PST 2020
Author: Simon Pilgrim
Date: 2020-03-06T11:36:18Z
New Revision: 7202d9cde9d23ba94c7f09fd6b5eafc437136542
URL: https://github.com/llvm/llvm-project/commit/7202d9cde9d23ba94c7f09fd6b5eafc437136542
DIFF: https://github.com/llvm/llvm-project/commit/7202d9cde9d23ba94c7f09fd6b5eafc437136542.diff
LOG: [DAG] Combine fshl/fshr(load1,load0,c) if we have consecutive loads
As noted on D75114, if both arguments of a funnel shift are consecutive loads, we are missing the opportunity to combine them into a single load.
Differential Revision: https://reviews.llvm.org/D75624
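To see why the fold is sound (a standalone sketch, not part of the commit; fshl32 and the memcpy-based loads are illustrative helpers), note that on a little-endian target a funnel shift of two consecutively loaded words by a byte-multiple amount produces the same bytes as one load taken at a byte offset, matching the fshl(ld1, ld0, c) form in the title:

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  // fshl(Hi, Lo, C) concatenates Hi:Lo and keeps the high word after
  // shifting left by C: (Hi << C) | (Lo >> (32 - C)), for 0 < C < 32.
  static uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned C) {
    return (Hi << C) | (Lo >> (32 - C));
  }

  int main() {
    // Eight consecutive bytes; Lo = bytes [0..3], Hi = bytes [4..7].
    uint8_t Bytes[8] = {0x10, 0x32, 0x54, 0x76, 0x98, 0xBA, 0xDC, 0xFE};
    uint32_t Lo, Hi, Single;
    std::memcpy(&Lo, Bytes, 4);
    std::memcpy(&Hi, Bytes + 4, 4);
    // fshl(Hi, Lo, 8) keeps the top 3 bytes of Lo and the low byte of Hi,
    // i.e. the 4 bytes starting (32 - 8) / 8 == 3 bytes into Lo's storage.
    std::memcpy(&Single, Bytes + 3, 4);
    assert(fshl32(Hi, Lo, 8) == Single);
    return 0;
  }

The byte offset is (BitWidth - ShAmt) / 8 for fshl (3 here), which is the PtrOff the combine below computes; for fshr it is ShAmt / 8 instead.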
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/X86/fshl.ll
llvm/test/CodeGen/X86/fshr.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index bca7b369fef0..ef3782dd42b5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8257,6 +8257,43 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
SDLoc(N), ShAmtTy));
+
+ // fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
+ // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
+ // TODO - big-endian support once we have test coverage.
+ // TODO - can we merge this with CombineConsecutiveLoads/MatchLoadCombine?
+ if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() &&
+ !DAG.getDataLayout().isBigEndian()) {
+ auto *LHS = dyn_cast<LoadSDNode>(N0);
+ auto *RHS = dyn_cast<LoadSDNode>(N1);
+ if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
+ LHS->getAddressSpace() == RHS->getAddressSpace() &&
+ (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS)) {
+ if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
+ SDLoc DL(RHS);
+ uint64_t PtrOff =
+ IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8);
+ unsigned NewAlign = MinAlign(RHS->getAlignment(), PtrOff);
+ bool Fast = false;
+ if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ RHS->getAddressSpace(), NewAlign,
+ RHS->getMemOperand()->getFlags(), &Fast) &&
+ Fast) {
+ SDValue NewPtr =
+ DAG.getMemBasePlusOffset(RHS->getBasePtr(), PtrOff, DL);
+ AddToWorklist(NewPtr.getNode());
+ SDValue Load = DAG.getLoad(
+ VT, DL, RHS->getChain(), NewPtr,
+ RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+ RHS->getMemOperand()->getFlags(), RHS->getAAInfo());
+ // Replace the old load's chain with the new load's chain.
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1));
+ return Load;
+ }
+ }
+ }
+ }
}
// fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2)
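As a hand-worked check of the PtrOff arithmetic above (again an illustrative sketch; fshr32 is a local helper, not an LLVM API): for fshr the low ShAmt bits of the low-address load are discarded, so the replacement load starts ShAmt / 8 bytes past it. That matches the i32 fshr test below, where ld0 sits at byte offset 8 and the shift amount is 8, yielding a single load at offset 9.

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  // fshr(Hi, Lo, C) == (Hi << (32 - C)) | (Lo >> C); on a little-endian
  // target the result's bytes start C / 8 bytes into Lo's storage.
  static uint32_t fshr32(uint32_t Hi, uint32_t Lo, unsigned C) {
    return (Hi << (32 - C)) | (Lo >> C);
  }

  int main() {
    uint8_t Bytes[8] = {0x10, 0x32, 0x54, 0x76, 0x98, 0xBA, 0xDC, 0xFE};
    uint32_t Lo, Hi, Single;
    std::memcpy(&Lo, Bytes, 4);          // load at the lower address (ld0)
    std::memcpy(&Hi, Bytes + 4, 4);      // consecutive load (ld1)
    std::memcpy(&Single, Bytes + 1, 4);  // single load at ShAmt / 8 == 1
    assert(fshr32(Hi, Lo, 8) == Single); // fshr by 8 == offset-1 load
    return 0;
  }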
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 6d8ccef45d20..465dea578267 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -547,39 +547,16 @@ define i8 @combine_fshl_load_i8(i8* %p) nounwind {
}
define i16 @combine_fshl_load_i16(i16* %p) nounwind {
-; X86-FAST-LABEL: combine_fshl_load_i16:
-; X86-FAST: # %bb.0:
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movzwl (%eax), %ecx
-; X86-FAST-NEXT: movzwl 2(%eax), %eax
-; X86-FAST-NEXT: shldw $8, %cx, %ax
-; X86-FAST-NEXT: retl
-;
-; X86-SLOW-LABEL: combine_fshl_load_i16:
-; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT: movzwl 2(%ecx), %eax
-; X86-SLOW-NEXT: movzbl 1(%ecx), %ecx
-; X86-SLOW-NEXT: shll $8, %eax
-; X86-SLOW-NEXT: orl %ecx, %eax
-; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-SLOW-NEXT: retl
-;
-; X64-FAST-LABEL: combine_fshl_load_i16:
-; X64-FAST: # %bb.0:
-; X64-FAST-NEXT: movzwl (%rdi), %ecx
-; X64-FAST-NEXT: movzwl 2(%rdi), %eax
-; X64-FAST-NEXT: shldw $8, %cx, %ax
-; X64-FAST-NEXT: retq
+; X86-LABEL: combine_fshl_load_i16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl 1(%eax), %eax
+; X86-NEXT: retl
;
-; X64-SLOW-LABEL: combine_fshl_load_i16:
-; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movzwl 2(%rdi), %eax
-; X64-SLOW-NEXT: movzbl 1(%rdi), %ecx
-; X64-SLOW-NEXT: shll $8, %eax
-; X64-SLOW-NEXT: orl %ecx, %eax
-; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; X64-SLOW-NEXT: retq
+; X64-LABEL: combine_fshl_load_i16:
+; X64: # %bb.0:
+; X64-NEXT: movzwl 1(%rdi), %eax
+; X64-NEXT: retq
%p0 = getelementptr i16, i16* %p, i32 0
%p1 = getelementptr i16, i16* %p, i32 1
%ld0 = load i16, i16 *%p0
@@ -589,31 +566,16 @@ define i16 @combine_fshl_load_i16(i16* %p) nounwind {
}
define i32 @combine_fshl_load_i32(i32* %p) nounwind {
-; X86-FAST-LABEL: combine_fshl_load_i32:
-; X86-FAST: # %bb.0:
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movl 8(%eax), %ecx
-; X86-FAST-NEXT: movl 12(%eax), %eax
-; X86-FAST-NEXT: shldl $8, %ecx, %eax
-; X86-FAST-NEXT: retl
-;
-; X86-SLOW-LABEL: combine_fshl_load_i32:
-; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl 11(%eax), %eax
-; X86-SLOW-NEXT: retl
-;
-; X64-FAST-LABEL: combine_fshl_load_i32:
-; X64-FAST: # %bb.0:
-; X64-FAST-NEXT: movl 8(%rdi), %ecx
-; X64-FAST-NEXT: movl 12(%rdi), %eax
-; X64-FAST-NEXT: shldl $8, %ecx, %eax
-; X64-FAST-NEXT: retq
+; X86-LABEL: combine_fshl_load_i32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 11(%eax), %eax
+; X86-NEXT: retl
;
-; X64-SLOW-LABEL: combine_fshl_load_i32:
-; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movl 11(%rdi), %eax
-; X64-SLOW-NEXT: retq
+; X64-LABEL: combine_fshl_load_i32:
+; X64: # %bb.0:
+; X64-NEXT: movl 11(%rdi), %eax
+; X64-NEXT: retq
%p0 = getelementptr i32, i32* %p, i32 2
%p1 = getelementptr i32, i32* %p, i32 3
%ld0 = load i32, i32 *%p0
@@ -652,21 +614,10 @@ define i64 @combine_fshl_load_i64(i64* %p) nounwind {
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: retl
;
-; X64-FAST-LABEL: combine_fshl_load_i64:
-; X64-FAST: # %bb.0:
-; X64-FAST-NEXT: movq 8(%rdi), %rcx
-; X64-FAST-NEXT: movq 16(%rdi), %rax
-; X64-FAST-NEXT: shldq $24, %rcx, %rax
-; X64-FAST-NEXT: retq
-;
-; X64-SLOW-LABEL: combine_fshl_load_i64:
-; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movq 8(%rdi), %rcx
-; X64-SLOW-NEXT: movq 16(%rdi), %rax
-; X64-SLOW-NEXT: shrq $40, %rcx
-; X64-SLOW-NEXT: shlq $24, %rax
-; X64-SLOW-NEXT: orq %rcx, %rax
-; X64-SLOW-NEXT: retq
+; X64-LABEL: combine_fshl_load_i64:
+; X64: # %bb.0:
+; X64-NEXT: movq 13(%rdi), %rax
+; X64-NEXT: retq
%p0 = getelementptr i64, i64* %p, i64 1
%p1 = getelementptr i64, i64* %p, i64 2
%ld0 = load i64, i64 *%p0
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index f977576ce73c..644e33fe198c 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -542,39 +542,16 @@ define i8 @combine_fshr_load_i8(i8* %p) nounwind {
}
define i16 @combine_fshr_load_i16(i16* %p) nounwind {
-; X86-FAST-LABEL: combine_fshr_load_i16:
-; X86-FAST: # %bb.0:
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movzwl (%eax), %ecx
-; X86-FAST-NEXT: movzwl 2(%eax), %eax
-; X86-FAST-NEXT: shldw $8, %cx, %ax
-; X86-FAST-NEXT: retl
-;
-; X86-SLOW-LABEL: combine_fshr_load_i16:
-; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT: movzwl 2(%ecx), %eax
-; X86-SLOW-NEXT: movzbl 1(%ecx), %ecx
-; X86-SLOW-NEXT: shll $8, %eax
-; X86-SLOW-NEXT: orl %ecx, %eax
-; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-SLOW-NEXT: retl
-;
-; X64-FAST-LABEL: combine_fshr_load_i16:
-; X64-FAST: # %bb.0:
-; X64-FAST-NEXT: movzwl (%rdi), %ecx
-; X64-FAST-NEXT: movzwl 2(%rdi), %eax
-; X64-FAST-NEXT: shldw $8, %cx, %ax
-; X64-FAST-NEXT: retq
+; X86-LABEL: combine_fshr_load_i16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl 1(%eax), %eax
+; X86-NEXT: retl
;
-; X64-SLOW-LABEL: combine_fshr_load_i16:
-; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movzwl 2(%rdi), %eax
-; X64-SLOW-NEXT: movzbl 1(%rdi), %ecx
-; X64-SLOW-NEXT: shll $8, %eax
-; X64-SLOW-NEXT: orl %ecx, %eax
-; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; X64-SLOW-NEXT: retq
+; X64-LABEL: combine_fshr_load_i16:
+; X64: # %bb.0:
+; X64-NEXT: movzwl 1(%rdi), %eax
+; X64-NEXT: retq
%p0 = getelementptr i16, i16* %p, i32 0
%p1 = getelementptr i16, i16* %p, i32 1
%ld0 = load i16, i16 *%p0
@@ -584,39 +561,16 @@ define i16 @combine_fshr_load_i16(i16* %p) nounwind {
}
define i32 @combine_fshr_load_i32(i32* %p) nounwind {
-; X86-FAST-LABEL: combine_fshr_load_i32:
-; X86-FAST: # %bb.0:
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movl 8(%eax), %ecx
-; X86-FAST-NEXT: movl 12(%eax), %eax
-; X86-FAST-NEXT: shldl $24, %ecx, %eax
-; X86-FAST-NEXT: retl
-;
-; X86-SLOW-LABEL: combine_fshr_load_i32:
-; X86-SLOW: # %bb.0:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl 8(%eax), %ecx
-; X86-SLOW-NEXT: movl 12(%eax), %eax
-; X86-SLOW-NEXT: shrl $8, %ecx
-; X86-SLOW-NEXT: shll $24, %eax
-; X86-SLOW-NEXT: orl %ecx, %eax
-; X86-SLOW-NEXT: retl
-;
-; X64-FAST-LABEL: combine_fshr_load_i32:
-; X64-FAST: # %bb.0:
-; X64-FAST-NEXT: movl 8(%rdi), %ecx
-; X64-FAST-NEXT: movl 12(%rdi), %eax
-; X64-FAST-NEXT: shldl $24, %ecx, %eax
-; X64-FAST-NEXT: retq
+; X86-LABEL: combine_fshr_load_i32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 9(%eax), %eax
+; X86-NEXT: retl
;
-; X64-SLOW-LABEL: combine_fshr_load_i32:
-; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movl 8(%rdi), %ecx
-; X64-SLOW-NEXT: movl 12(%rdi), %eax
-; X64-SLOW-NEXT: shrl $8, %ecx
-; X64-SLOW-NEXT: shll $24, %eax
-; X64-SLOW-NEXT: orl %ecx, %eax
-; X64-SLOW-NEXT: retq
+; X64-LABEL: combine_fshr_load_i32:
+; X64: # %bb.0:
+; X64-NEXT: movl 9(%rdi), %eax
+; X64-NEXT: retq
%p0 = getelementptr i32, i32* %p, i32 2
%p1 = getelementptr i32, i32* %p, i32 3
%ld0 = load i32, i32 *%p0
@@ -656,21 +610,10 @@ define i64 @combine_fshr_load_i64(i64* %p) nounwind {
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: retl
;
-; X64-FAST-LABEL: combine_fshr_load_i64:
-; X64-FAST: # %bb.0:
-; X64-FAST-NEXT: movq 8(%rdi), %rcx
-; X64-FAST-NEXT: movq 16(%rdi), %rax
-; X64-FAST-NEXT: shldq $40, %rcx, %rax
-; X64-FAST-NEXT: retq
-;
-; X64-SLOW-LABEL: combine_fshr_load_i64:
-; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: movq 8(%rdi), %rcx
-; X64-SLOW-NEXT: movq 16(%rdi), %rax
-; X64-SLOW-NEXT: shrq $24, %rcx
-; X64-SLOW-NEXT: shlq $40, %rax
-; X64-SLOW-NEXT: orq %rcx, %rax
-; X64-SLOW-NEXT: retq
+; X64-LABEL: combine_fshr_load_i64:
+; X64: # %bb.0:
+; X64-NEXT: movq 11(%rdi), %rax
+; X64-NEXT: retq
%p0 = getelementptr i64, i64* %p, i64 1
%p1 = getelementptr i64, i64* %p, i64 2
%ld0 = load i64, i64 *%p0