[llvm] r327172 - Correct load-op-store cycle detection analysis
Nirav Dave via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 9 12:58:07 PST 2018
Author: niravd
Date: Fri Mar 9 12:58:07 2018
New Revision: 327172
URL: http://llvm.org/viewvc/llvm-project?rev=327172&view=rev
Log:
Correct load-op-store cycle detection analysis
Add missing cycle dependency checks in load-op-store fusion.
Fixes PR36274.
Reviewers: craig.topper, bogner
Subscribers: hiraditya, llvm-commits
Differential Revision: https://reviews.llvm.org/D43154
Added:
llvm/trunk/test/CodeGen/X86/pr36274.ll
Modified:
llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
Modified: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp?rev=327172&r1=327171&r2=327172&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp Fri Mar 9 12:58:07 2018
@@ -2109,27 +2109,56 @@ static bool isFusableLoadOpStorePattern(
LoadNode->getOffset() != StoreNode->getOffset())
return false;
- // Check if the chain is produced by the load or is a TokenFactor with
- // the load output chain as an operand. Return InputChain by reference.
+ bool FoundLoad = false;
+ SmallVector<SDValue, 4> ChainOps;
+ SmallVector<const SDNode *, 4> LoopWorklist;
+ SmallPtrSet<const SDNode *, 16> Visited;
+ const unsigned int Max = 1024;
+
+ // Visualization of Load-Op-Store fusion:
+ // -------------------------
+ // Legend:
+ // *-lines = Chain operand dependencies.
+ // |-lines = Normal operand dependencies.
+ // Dependencies flow down and right. n-suffix references multiple nodes.
+ //
+ // C Xn C
+ // * * *
+ // * * *
+ // Xn A-LD Yn TF Yn
+ // * * \ | * |
+ // * * \ | * |
+ // * * \ | => A--LD_OP_ST
+ // * * \| \
+ // TF OP \
+ // * | \ Zn
+ // * | \
+ // A-ST Zn
+ //
+
+ // This merge induced dependences from: #1: Xn -> LD, OP, Zn
+ // #2: Yn -> LD
+ // #3: ST -> Zn
+
+ // Ensure the transform is safe by checking for the dual
+ // dependencies to make sure we do not induce a loop.
+
+ // As LD is a predecessor to both OP and ST we can do this by checking:
+ // a). if LD is a predecessor to a member of Xn or Yn.
+ // b). if a Zn is a predecessor to ST.
+
+ // However, (b) can only occur through being a chain predecessor to
+ // ST, which is the same as Zn being a member or predecessor of Xn,
+ // which is a subset of LD being a predecessor of Xn. So it's
+ // subsumed by check (a).
+
SDValue Chain = StoreNode->getChain();
+ // Gather X elements in ChainOps.
if (Chain == Load.getValue(1)) {
- InputChain = LoadNode->getChain();
- return true;
- }
-
- if (Chain.getOpcode() == ISD::TokenFactor) {
- // Fusing Load-Op-Store requires predecessors of store must also
- // be predecessors of the load. This addition may cause a loop. We
- // can check this by doing a search for Load in the new
- // dependencies. As this can be expensive, heuristically prune
- // this search by visiting the uses and make sure they all have
- // smaller node id than the load.
-
- bool FoundLoad = false;
- SmallVector<SDValue, 4> ChainOps;
- SmallVector<const SDNode *, 4> LoopWorklist;
- SmallPtrSet<const SDNode *, 16> Visited;
+ FoundLoad = true;
+ ChainOps.push_back(Load.getOperand(0));
+ } else if (Chain.getOpcode() == ISD::TokenFactor) {
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
@@ -2141,31 +2170,25 @@ static bool isFusableLoadOpStorePattern(
LoopWorklist.push_back(Op.getNode());
ChainOps.push_back(Op);
}
+ }
- if (!FoundLoad)
- return false;
+ if (!FoundLoad)
+ return false;
- // If Loop Worklist is not empty. Check if we would make a loop.
- if (!LoopWorklist.empty()) {
- const unsigned int Max = 8192;
- // if Load is predecessor to potentially loop inducing chain
- // dependencies.
- if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist,
- Max, true))
- return false;
- // Fail conservatively if we ended loop search early.
- if (Visited.size() >= Max)
- return false;
- }
+ // Worklist is currently Xn. Add Yn to worklist.
+ for (SDValue Op : StoredVal->ops())
+ if (Op.getNode() != LoadNode)
+ LoopWorklist.push_back(Op.getNode());
+
+ // Check (a) if Load is a predecessor to Xn + Yn
+ if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
+ true))
+ return false;
- // Make a new TokenFactor with all the other input chains except
- // for the load.
- InputChain =
- CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
- return true;
+ InputChain =
+ CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
+ return true;
}
- return false;
-}
// Change a chain of {load; op; store} of the same value into a simple op
// through memory of that value, if the uses of the modified value and its
Added: llvm/trunk/test/CodeGen/X86/pr36274.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr36274.ll?rev=327172&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr36274.ll (added)
+++ llvm/trunk/test/CodeGen/X86/pr36274.ll Fri Mar 9 12:58:07 2018
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s
+
+; This tests is checking for a case where the x86 load-op-store fusion
+; misses a dependence between the fused load and a non-fused operand
+; to the load causing a cycle. Here the dependence in question comes
+; from the carry in input of the adcl.
+
+ at vx = external local_unnamed_addr global <2 x i32>, align 8
+
+define void @pr36274(i32* %somewhere) {
+; CHECK-LABEL: pr36274:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl vx+4, %eax
+; CHECK-NEXT: addl $1, vx
+; CHECK-NEXT: adcl $0, %eax
+; CHECK-NEXT: movl %eax, vx+4
+; CHECK-NEXT: retl
+ %a0 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 0
+ %a1 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 1
+ %x1 = load volatile i32, i32* %a1, align 4
+ %x0 = load volatile i32, i32* %a0, align 8
+ %vx0 = insertelement <2 x i32> undef, i32 %x0, i32 0
+ %vx1 = insertelement <2 x i32> %vx0, i32 %x1, i32 1
+ %x = bitcast <2 x i32> %vx1 to i64
+ %add = add i64 %x, 1
+ %vadd = bitcast i64 %add to <2 x i32>
+ %vx1_0 = extractelement <2 x i32> %vadd, i32 0
+ %vx1_1 = extractelement <2 x i32> %vadd, i32 1
+ store i32 %vx1_0, i32* %a0, align 8
+ store i32 %vx1_1, i32* %a1, align 4
+ ret void
+}
More information about the llvm-commits
mailing list