[llvm] r239472 - [StatepointLowering] Reuse stack slots across basic blocks

Wed Jun 10 05:31:54 PDT 2015

Author: igor.laevsky
Date: Wed Jun 10 07:31:53 2015
New Revision: 239472

URL: http://llvm.org/viewvc/llvm-project?rev=239472&view=rev
Log:
[StatepointLowering] Reuse stack slots across basic blocks

During statepoint lowering we can sometimes avoid spilling of the value if we know that it was already spilled for previous statepoint.
We were doing this by checking if incoming statepoint value was lowered into load from stack slot. This was working only in boundaries of one basic block.

But instead of looking at the lowered node we can look directly at the llvm-ir value and if it was gc.relocate (or some simple modification of it) look up stack slot for it's derived pointer and reuse stack slot from it. This allows us to look across basic block boundaries.

Differential Revision: http://reviews.llvm.org/D10251


Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
    llvm/trunk/test/CodeGen/X86/statepoint-stack-usage.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/StatepointLowering.cpp?rev=239472&r1=239471&r2=239472&view=diff
==============================================================================

--- llvm/trunk/lib/CodeGen/SelectionDAG/StatepointLowering.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/StatepointLowering.cpp Wed Jun 10 07:31:53 2015
@@ -113,84 +113,137 @@ StatepointLoweringState::allocateStackSl
   llvm_unreachable("infinite loop?");
 }
 
+/// Utility function for reservePreviousStackSlotForValue. Tries to find
+/// stack slot index to which we have spilled value for previous statepoints.
+/// LookUpDepth specifies maximum DFS depth this function is allowed to look.
+static Optional<int> findPreviousSpillSlot(const Value *Val,
+                                           SelectionDAGBuilder &Builder,
+                                           int LookUpDepth) {
+  // Can not look any futher - give up now
+  if (LookUpDepth <= 0)
+    return Optional<int>();
+
+  // Spill location is known for gc relocates
+  if (isGCRelocate(Val)) {
+    GCRelocateOperands RelocOps(cast<Instruction>(Val));
+
+    FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap =
+        Builder.FuncInfo.StatepointRelocatedValues[RelocOps.getStatepoint()];
+
+    auto It = SpillMap.find(RelocOps.getDerivedPtr());
+    if (It == SpillMap.end())
+      return Optional<int>();
+
+    return It->second;
+  }
+
+  // Look through bitcast instructions.
+  if (const BitCastInst *Cast = dyn_cast<BitCastInst>(Val)) {
+    return findPreviousSpillSlot(Cast->getOperand(0), Builder, LookUpDepth - 1);
+  }
+
+  // Look through phi nodes
+  // All incoming values should have same known stack slot, otherwise result
+  // is unknown.
+  if (const PHINode *Phi = dyn_cast<PHINode>(Val)) {
+    Optional<int> MergedResult = None;
+
+    for (auto &IncomingValue : Phi->incoming_values()) {
+      Optional<int> SpillSlot =
+          findPreviousSpillSlot(IncomingValue, Builder, LookUpDepth - 1);
+      if (!SpillSlot.hasValue())
+        return Optional<int>();
+
+      if (MergedResult.hasValue() && *MergedResult != *SpillSlot)
+        return Optional<int>();
+
+      MergedResult = SpillSlot;
+    }
+    return MergedResult;
+  }
+
+  // TODO: We can do better for PHI nodes. In cases like this:
+  //   ptr = phi(relocated_pointer, not_relocated_pointer)
+  //   statepoint(ptr)
+  // We will return that stack slot for ptr is unknown. And later we might
+  // assign different stack slots for ptr and relocated_pointer. This limits
+  // llvm's ability to remove redundant stores.
+  // Unfortunately it's hard to accomplish in current infrastructure.
+  // We use this function to eliminate spill store completely, while
+  // in example we still need to emit store, but instead of any location
+  // we need to use special "preferred" location.
+
+  // TODO: handle simple updates.  If a value is modified and the original
+  // value is no longer live, it would be nice to put the modified value in the
+  // same slot.  This allows folding of the memory accesses for some
+  // instructions types (like an increment).
+  //   statepoint (i)
+  //   i1 = i+1
+  //   statepoint (i1)
+  // However we need to be careful for cases like this:
+  //   statepoint(i)
+  //   i1 = i+1
+  //   statepoint(i, i1)
+  // Here we want to reserve spill slot for 'i', but not for 'i+1'. If we just
+  // put handling of simple modifications in this function like it's done
+  // for bitcasts we might end up reserving i's slot for 'i+1' because order in
+  // which we visit values is unspecified.
+
+  // Don't know any information about this instruction
+  return Optional<int>();
+}
+
 /// Try to find existing copies of the incoming values in stack slots used for
 /// statepoint spilling.  If we can find a spill slot for the incoming value,
 /// mark that slot as allocated, and reuse the same slot for this safepoint.
 /// This helps to avoid series of loads and stores that only serve to resuffle
 /// values on the stack between calls.
-static void reservePreviousStackSlotForValue(SDValue Incoming,
+static void reservePreviousStackSlotForValue(const Value *IncomingValue,
                                              SelectionDAGBuilder &Builder) {
 
+  SDValue Incoming = Builder.getValue(IncomingValue);
+
   if (isa<ConstantSDNode>(Incoming) || isa<FrameIndexSDNode>(Incoming)) {
     // We won't need to spill this, so no need to check for previously
     // allocated stack slots
     return;
   }
 
-  SDValue Loc = Builder.StatepointLowering.getLocation(Incoming);
-  if (Loc.getNode()) {
+  SDValue OldLocation = Builder.StatepointLowering.getLocation(Incoming);
+  if (OldLocation.getNode())
     // duplicates in input
     return;
-  }
 
-  // Search back for the load from a stack slot pattern to find the original
-  // slot we allocated for this value.  We could extend this to deal with
-  // simple modification patterns, but simple dealing with trivial load/store
-  // sequences helps a lot already.
-  if (LoadSDNode *Load = dyn_cast<LoadSDNode>(Incoming)) {
-    if (auto *FI = dyn_cast<FrameIndexSDNode>(Load->getBasePtr())) {
-      const int Index = FI->getIndex();
-      auto Itr = std::find(Builder.FuncInfo.StatepointStackSlots.begin(),
-                           Builder.FuncInfo.StatepointStackSlots.end(), Index);
-      if (Itr == Builder.FuncInfo.StatepointStackSlots.end()) {
-        // not one of the lowering stack slots, can't reuse!
-        // TODO: Actually, we probably could reuse the stack slot if the value
-        // hasn't changed at all, but we'd need to look for intervening writes
-        return;
-      } else {
-        // This is one of our dedicated lowering slots
-        const int Offset =
-            std::distance(Builder.FuncInfo.StatepointStackSlots.begin(), Itr);
-        if (Builder.StatepointLowering.isStackSlotAllocated(Offset)) {
-          // stack slot already assigned to someone else, can't use it!
-          // TODO: currently we reserve space for gc arguments after doing
-          // normal allocation for deopt arguments.  We should reserve for
-          // _all_ deopt and gc arguments, then start allocating.  This
-          // will prevent some moves being inserted when vm state changes,
-          // but gc state doesn't between two calls.
-          return;
-        }
-        // Reserve this stack slot
-        Builder.StatepointLowering.reserveStackSlot(Offset);
-      }
-
-      // Cache this slot so we find it when going through the normal
-      // assignment loop.
-      SDValue Loc =
-          Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
+  const int LookUpDepth = 6;
+  Optional<int> Index =
+      findPreviousSpillSlot(IncomingValue, Builder, LookUpDepth);
+  if (!Index.hasValue())
+    return;
 
-      Builder.StatepointLowering.setLocation(Incoming, Loc);
-    }
+  auto Itr = std::find(Builder.FuncInfo.StatepointStackSlots.begin(),
+                       Builder.FuncInfo.StatepointStackSlots.end(), *Index);
+  assert(Itr != Builder.FuncInfo.StatepointStackSlots.end() &&
+         "value spilled to the unknown stack slot");
+
+  // This is one of our dedicated lowering slots
+  const int Offset =
+      std::distance(Builder.FuncInfo.StatepointStackSlots.begin(), Itr);
+  if (Builder.StatepointLowering.isStackSlotAllocated(Offset)) {
+    // stack slot already assigned to someone else, can't use it!
+    // TODO: currently we reserve space for gc arguments after doing
+    // normal allocation for deopt arguments.  We should reserve for
+    // _all_ deopt and gc arguments, then start allocating.  This
+    // will prevent some moves being inserted when vm state changes,
+    // but gc state doesn't between two calls.
+    return;
   }
+  // Reserve this stack slot
+  Builder.StatepointLowering.reserveStackSlot(Offset);
 
-  // TODO: handle case where a reloaded value flows through a phi to
-  // another safepoint.  e.g.
-  // bb1:
-  //  a' = relocated...
-  // bb2: % pred: bb1, bb3, bb4, etc.
-  //  a_phi = phi(a', ...)
-  // statepoint ... a_phi
-  // NOTE: This will require reasoning about cross basic block values.  This is
-  // decidedly non trivial and this might not be the right place to do it.  We
-  // don't really have the information we need here...
-
-  // TODO: handle simple updates.  If a value is modified and the original
-  // value is no longer live, it would be nice to put the modified value in the
-  // same slot.  This allows folding of the memory accesses for some
-  // instructions types (like an increment).
-  // statepoint (i)
-  // i1 = i+1
-  // statepoint (i1)
+  // Cache this slot so we find it when going through the normal
+  // assignment loop.
+  SDValue Loc = Builder.DAG.getTargetFrameIndex(*Index, Incoming.getValueType());
+  Builder.StatepointLowering.setLocation(Incoming, Loc);
 }
 
 /// Remove any duplicate (as SDValues) from the derived pointer pairs.  This
@@ -458,15 +511,11 @@ static void lowerStatepointMetaArgs(Smal
   // doesn't change semantics at all.  It is important for performance that we
   // reserve slots for both deopt and gc values before lowering either.
   for (const Value *V : StatepointSite.vm_state_args()) {
-    SDValue Incoming = Builder.getValue(V);
-    reservePreviousStackSlotForValue(Incoming, Builder);
+    reservePreviousStackSlotForValue(V, Builder);
   }
   for (unsigned i = 0; i < Bases.size(); ++i) {
-    const Value *Base = Bases[i];
-    reservePreviousStackSlotForValue(Builder.getValue(Base), Builder);
-
-    const Value *Ptr = Ptrs[i];
-    reservePreviousStackSlotForValue(Builder.getValue(Ptr), Builder);
+    reservePreviousStackSlotForValue(Bases[i], Builder);
+    reservePreviousStackSlotForValue(Ptrs[i], Builder);
   }
 
   // First, prefix the list with the number of unique values to be

Modified: llvm/trunk/test/CodeGen/X86/statepoint-stack-usage.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/statepoint-stack-usage.ll?rev=239472&r1=239471&r2=239472&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/statepoint-stack-usage.ll (original)
+++ llvm/trunk/test/CodeGen/X86/statepoint-stack-usage.ll Wed Jun 10 07:31:53 2015
@@ -52,9 +52,53 @@ define i32 @reserve_first(i32 addrspace(
   ret i32 1
 }
 
+; Test that stack slots are reused for invokes
+define i32 @back_to_back_invokes(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
+; CHECK-LABEL: back_to_back_invokes
+entry:
+  ; The exact stores don't matter, but there need to be three stack slots created
+  ; CHECK: movq	%rdi, 16(%rsp)
+  ; CHECK: movq	%rdx, 8(%rsp)
+  ; CHECK: movq	%rsi, (%rsp)
+  ; CHECK: callq
+  %safepoint_token = invoke i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
+                   to label %normal_return unwind label %exceptional_return
+
+normal_return:
+  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 12, i32 12)
+  %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 12, i32 13)
+  %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 12, i32 14)
+  ; Should work even through bitcasts
+  %c1.casted = bitcast i32 addrspace(1)* %c1 to i8 addrspace(1)*
+  ; This is the key check.  There should NOT be any memory moves here
+  ; CHECK-NOT: movq
+  ; CHECK: callq
+  %safepoint_token2 = invoke i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i8 addrspace(1)* %c1.casted, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
+                    to label %normal_return2 unwind label %exceptional_return2
+
+normal_return2:
+  %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 12, i32 14)
+  %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 12, i32 13)
+  %c2 = tail call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32 %safepoint_token2, i32 12, i32 12)
+  ret i32 1
+
+exceptional_return:
+  %landing_pad = landingpad { i8*, i32 } personality i32 ()* @"personality_function"
+          cleanup
+  ret i32 0
+
+exceptional_return2:
+  %landing_pad2 = landingpad { i8*, i32 } personality i32 ()* @"personality_function"
+          cleanup
+  ret i32 0
+}
+
 ; Function Attrs: nounwind
 declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32, i32, i32) #3
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(i32, i32, i32) #3
 
 declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
 
-attributes #1 = { uwtable }
\ No newline at end of file
+declare i32 @"personality_function"()
+
+attributes #1 = { uwtable }