[llvm] a027570 - [Codegen/Statepoint] Allow usage of registers for non-GC deopt values.

Serguei Katkov via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 9 02:58:21 PDT 2020


Author: Serguei Katkov
Date: 2020-04-09T16:57:35+07:00
New Revision: a0275705bb5aa938119c3e7c8bc957a823450b17

URL: https://github.com/llvm/llvm-project/commit/a0275705bb5aa938119c3e7c8bc957a823450b17
DIFF: https://github.com/llvm/llvm-project/commit/a0275705bb5aa938119c3e7c8bc957a823450b17.diff

LOG: [Codegen/Statepoint] Allow usage of registers for non-GC deopt values.

This change introduces the use of physical registers for non-GC deopt values.
It requires runtime support that knows how to read a value from a register.
By default the feature is off; it can be switched on with the
-use-registers-for-deopt-values option.

The change also introduces an additional fix-up pass which forces the spilling
of caller-saved registers (clobbered by the call) and rewrites the statepoint
to use spill slots instead of caller-saved registers.
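
As an illustration of the "runtime support" mentioned above, here is a
minimal sketch of how a runtime might read one deopt value once register
locations can appear in the stackmap. The location kinds follow LLVM's
documented stackmap format; RegisterFile, readDwarfReg and loadFromFrame
are hypothetical helpers, not part of this patch.

  #include <cstdint>
  #include <cstring>

  // Location kinds as documented in LLVM's stackmap format.
  enum LocationKind : uint8_t {
    Register = 0x1, // value is in a register; with this change, non-GC
                    // deopt values may now be reported this way
    Direct   = 0x2, // Reg + Offset is the address of the value
    Indirect = 0x3, // value was spilled; load it from [Reg + Offset]
    Constant = 0x4, // small constant stored inline in the Offset field
  };

  // Hypothetical register-context helper.
  struct RegisterFile {
    uint64_t Regs[64] = {};
    uint64_t readDwarfReg(uint16_t DwarfRegNum) const {
      return Regs[DwarfRegNum];
    }
  };

  // Loads Size bytes from a spill slot (little-endian assumed).
  static uint64_t loadFromFrame(const uint8_t *FrameReg, int32_t Offset,
                                uint16_t Size) {
    uint64_t V = 0;
    std::memcpy(&V, FrameReg + Offset, Size);
    return V;
  }

  // Reads one deopt value described by a stackmap location record.
  uint64_t readDeoptValue(LocationKind Kind, uint16_t DwarfRegNum,
                          int32_t OffsetOrConst, uint16_t Size,
                          const RegisterFile &Regs, const uint8_t *FrameReg) {
    switch (Kind) {
    case Register: // the new case a runtime must be able to handle
      return Regs.readDwarfReg(DwarfRegNum);
    case Indirect: // what the fix-up pass guarantees for caller-saved values
      return loadFromFrame(FrameReg, OffsetOrConst, Size);
    case Constant:
      return static_cast<uint64_t>(OffsetOrConst);
    default:       // Direct and ConstIndex omitted for brevity
      return 0;
    }
  }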

Reviewers: reames, dantrushin
Reviewed By: reames, dantrushin
Subscribers: mgorny, hiraditya, mgrang, llvm-commits
Differential Revision: https://reviews.llvm.org/D77371

Added: 
    llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
    llvm/test/CodeGen/X86/statepoint-regs.ll

Modified: 
    llvm/include/llvm/CodeGen/Passes.h
    llvm/include/llvm/InitializePasses.h
    llvm/lib/CodeGen/CMakeLists.txt
    llvm/lib/CodeGen/CodeGen.cpp
    llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
    llvm/lib/CodeGen/TargetPassConfig.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 7627baa2f059..b255447e2a98 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -475,6 +475,10 @@ namespace llvm {
 
   /// Creates MIR Debugify pass. \see MachineDebugify.cpp
   ModulePass *createDebugifyMachineModulePass();
+
+  /// The pass fixes up statepoint machine instructions to replace the usage
+  /// of caller-saved registers with stack slots.
+  extern char &FixupStatepointCallerSavedID;
 } // End llvm namespace
 
 #endif

diff  --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index a71079fdc2b5..34a21541556a 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -152,6 +152,7 @@ void initializeExternalAAWrapperPassPass(PassRegistry&);
 void initializeFEntryInserterPass(PassRegistry&);
 void initializeFinalizeISelPass(PassRegistry&);
 void initializeFinalizeMachineBundlesPass(PassRegistry&);
+void initializeFixupStatepointCallerSavedPass(PassRegistry&);
 void initializeFlattenCFGPassPass(PassRegistry&);
 void initializeFloat2IntLegacyPassPass(PassRegistry&);
 void initializeForceFunctionAttrsLegacyPassPass(PassRegistry&);

diff  --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 38187897fb66..0bee58051b2e 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -30,6 +30,7 @@ add_llvm_component_library(LLVMCodeGen
   FaultMaps.cpp
   FEntryInserter.cpp
   FinalizeISel.cpp
+  FixupStatepointCallerSaved.cpp
   FuncletLayout.cpp
   GCMetadata.cpp
   GCMetadataPrinter.cpp

diff  --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index f3b596a8f2bb..6237b8dd77e0 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -39,6 +39,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeFEntryInserterPass(Registry);
   initializeFinalizeISelPass(Registry);
   initializeFinalizeMachineBundlesPass(Registry);
+  initializeFixupStatepointCallerSavedPass(Registry);
   initializeFuncletLayoutPass(Registry);
   initializeGCMachineCodeAnalysisPass(Registry);
   initializeGCModuleInfoPass(Registry);

diff  --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
new file mode 100644
index 000000000000..29478e9eeec9
--- /dev/null
+++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
@@ -0,0 +1,310 @@
+//===-- FixupStatepointCallerSaved.cpp - Fixup caller saved registers  ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// A statepoint instruction's deopt parameters contain values which are
+/// meaningful to the runtime and must be readable at the moment the call
+/// returns. In other words, these values are "read late" by the runtime.
+/// If we could express this notion to the register allocator, it would
+/// produce the right form for us on its own.
+/// The need for a fix-up (i.e. this pass) comes specifically from the fact
+/// that we cannot describe such a late read to the register allocator.
+/// The register allocator may put a value in a register clobbered by the
+/// call. This pass forces the spill of such registers and rewrites the
+/// corresponding statepoint operands to use the added spill slots.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "fixup-statepoint-caller-saved"
+STATISTIC(NumSpilledRegisters, "Number of spilled registers");
+STATISTIC(NumSpillSlotsAllocated, "Number of spill slots allocated");
+STATISTIC(NumSpillSlotsExtended, "Number of spill slots extended");
+
+static cl::opt<bool> FixupSCSExtendSlotSize(
+    "fixup-scs-extend-slot-size", cl::Hidden, cl::init(false),
+    cl::desc(
+        "Allow spilling in a spill slot of greater size than register size"));
+
+namespace {
+
+class FixupStatepointCallerSaved : public MachineFunctionPass {
+public:
+  static char ID;
+
+  FixupStatepointCallerSaved() : MachineFunctionPass(ID) {
+    initializeFixupStatepointCallerSavedPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override {
+    return "Fixup Statepoint Caller Saved";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char FixupStatepointCallerSaved::ID = 0;
+char &llvm::FixupStatepointCallerSavedID = FixupStatepointCallerSaved::ID;
+
+INITIALIZE_PASS_BEGIN(FixupStatepointCallerSaved, DEBUG_TYPE,
+                      "Fixup Statepoint Caller Saved", false, false)
+INITIALIZE_PASS_END(FixupStatepointCallerSaved, DEBUG_TYPE,
+                    "Fixup Statepoint Caller Saved", false, false)
+
+// Utility function to get size of the register.
+static unsigned getRegisterSize(const TargetRegisterInfo &TRI, Register Reg) {
+  const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
+  return TRI.getSpillSize(*RC);
+}
+
+// Cache of frame indexes used during a statepoint rewrite, so that they can
+// be reused when processing the next statepoint instruction.
+// There are two strategies. One preserves the size of each spill slot; the
+// other extends spill slots to reduce their number, and with it the total
+// frame size, at the cost of reloads becoming implicit any-extends.
+class FrameIndexesCache {
+private:
+  struct FrameIndexesPerSize {
+    // Frame indexes used while processing previous statepoints.
+    SmallVector<int, 8> Slots;
+    // Index of the first not-yet-used frame index in Slots.
+    unsigned Index = 0;
+  };
+  MachineFrameInfo &MFI;
+  const TargetRegisterInfo &TRI;
+  // Map from size to the list of frame indexes of that size. In
+  // FixupSCSExtendSlotSize mode the key 0 is used to keep all frame indexes.
+  // If the required spill slot is larger than the cached one, the cached
+  // slot's size is increased.
+  DenseMap<unsigned, FrameIndexesPerSize> Cache;
+
+public:
+  FrameIndexesCache(MachineFrameInfo &MFI, const TargetRegisterInfo &TRI)
+      : MFI(MFI), TRI(TRI) {}
+  // Reset the current state of used frame indexes. After invocation of
+  // this function all frame indexes are available for allocation.
+  void reset() {
+    for (auto &It : Cache)
+      It.second.Index = 0;
+  }
+  // Get frame index to spill the register.
+  int getFrameIndex(Register Reg) {
+    unsigned Size = getRegisterSize(TRI, Reg);
+    // In FixupSCSExtendSlotSize mode the bucket with 0 index is used
+    // for all sizes.
+    unsigned Bucket = FixupSCSExtendSlotSize ? 0 : Size;
+    FrameIndexesPerSize &Line = Cache[Bucket];
+    if (Line.Index < Line.Slots.size()) {
+      int FI = Line.Slots[Line.Index++];
+      // If all sizes are kept together we probably need to extend the
+      // spill slot size.
+      if (MFI.getObjectSize(FI) < Size) {
+        MFI.setObjectSize(FI, Size);
+        MFI.setObjectAlignment(FI, Align(Size));
+        NumSpillSlotsExtended++;
+      }
+      return FI;
+    }
+    int FI = MFI.CreateSpillStackObject(Size, Size);
+    NumSpillSlotsAllocated++;
+    Line.Slots.push_back(FI);
+    ++Line.Index;
+    return FI;
+  }
+  // Sort all registers to spill in descending order of size. In
+  // FixupSCSExtendSlotSize mode this minimizes the total frame size. In
+  // non-FixupSCSExtendSlotSize mode this step can be skipped.
+  void sortRegisters(SmallVectorImpl<Register> &Regs) {
+    if (!FixupSCSExtendSlotSize)
+      return;
+    llvm::sort(Regs.begin(), Regs.end(), [&](Register &A, Register &B) {
+      return getRegisterSize(TRI, A) > getRegisterSize(TRI, B);
+    });
+  }
+};
+
+// Describes the state of the statepoint instruction currently being processed.
+class StatepointState {
+private:
+  // The statepoint instruction being processed.
+  MachineInstr &MI;
+  MachineFunction &MF;
+  const TargetRegisterInfo &TRI;
+  const TargetInstrInfo &TII;
+  MachineFrameInfo &MFI;
+  // Mask of callee-saved registers.
+  const uint32_t *Mask;
+  // Cache of frame indexes used while processing previous statepoints.
+  FrameIndexesCache &CacheFI;
+  // Operands with physical registers requiring spilling.
+  SmallVector<unsigned, 8> OpsToSpill;
+  // Set of registers to spill.
+  SmallVector<Register, 8> RegsToSpill;
+  // Map from register to frame slot index.
+  DenseMap<Register, int> RegToSlotIdx;
+
+public:
+  StatepointState(MachineInstr &MI, const uint32_t *Mask,
+                  FrameIndexesCache &CacheFI)
+      : MI(MI), MF(*MI.getMF()), TRI(*MF.getSubtarget().getRegisterInfo()),
+        TII(*MF.getSubtarget().getInstrInfo()), MFI(MF.getFrameInfo()),
+        Mask(Mask), CacheFI(CacheFI) {}
+  // Return true if register is callee saved.
+  bool isCalleeSaved(Register Reg) { return (Mask[Reg / 32] >> Reg % 32) & 1; }
+  // Iterates over statepoint meta args to find caller-saved registers.
+  // Also records the operands that will need rewriting.
+  // Returns true if any caller-saved registers were found.
+  bool findRegistersToSpill() {
+    SmallSet<Register, 8> VisitedRegs;
+    for (unsigned Idx = StatepointOpers(&MI).getVarIdx(),
+                  EndIdx = MI.getNumOperands();
+         Idx < EndIdx; ++Idx) {
+      MachineOperand &MO = MI.getOperand(Idx);
+      if (!MO.isReg() || MO.isImplicit())
+        continue;
+      Register Reg = MO.getReg();
+      assert(Reg.isPhysical() && "Only physical regs are expected");
+      if (isCalleeSaved(Reg))
+        continue;
+      if (VisitedRegs.insert(Reg).second)
+        RegsToSpill.push_back(Reg);
+      OpsToSpill.push_back(Idx);
+    }
+    CacheFI.sortRegisters(RegsToSpill);
+    return !RegsToSpill.empty();
+  }
+  // Spill all caller-saved registers right before the statepoint instruction.
+  // Remember the frame index where each register is spilled.
+  void spillRegisters() {
+    for (Register Reg : RegsToSpill) {
+      int FI = CacheFI.getFrameIndex(Reg);
+      const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
+      TII.storeRegToStackSlot(*MI.getParent(), MI, Reg, true /*is_Kill*/, FI,
+                              RC, &TRI);
+      NumSpilledRegisters++;
+      RegToSlotIdx[Reg] = FI;
+    }
+  }
+  // Rewrite the statepoint machine instruction to replace caller-saved
+  // register operands with indirect memory locations (frame indexes).
+  void rewriteStatepoint() {
+    MachineInstr *NewMI =
+        MF.CreateMachineInstr(TII.get(MI.getOpcode()), MI.getDebugLoc(), true);
+    MachineInstrBuilder MIB(MF, NewMI);
+
+    // Add End marker.
+    OpsToSpill.push_back(MI.getNumOperands());
+    unsigned CurOpIdx = 0;
+
+    for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+      MachineOperand &MO = MI.getOperand(I);
+      if (I == OpsToSpill[CurOpIdx]) {
+        assert(MO.isReg() && "Should be register");
+        assert(MO.getReg().isPhysical() && "Should be physical register");
+        int FI = RegToSlotIdx[MO.getReg()];
+        MIB.addImm(StackMaps::IndirectMemRefOp);
+        MIB.addImm(getRegisterSize(TRI, MO.getReg()));
+        MIB.addFrameIndex(FI);
+        MIB.addImm(0);
+        ++CurOpIdx;
+      } else
+        MIB.add(MO);
+    }
+    assert(CurOpIdx == (OpsToSpill.size() - 1) && "Not all operands processed");
+    // Add mem operands.
+    NewMI->setMemRefs(MF, MI.memoperands());
+    for (auto It : RegToSlotIdx) {
+      int FrameIndex = It.second;
+      auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+      auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+                                          getRegisterSize(TRI, It.first),
+                                          MFI.getObjectAlign(FrameIndex));
+      NewMI->addMemOperand(MF, MMO);
+    }
+    // Insert new statepoint and erase old one.
+    MI.getParent()->insert(MI, NewMI);
+    MI.eraseFromParent();
+  }
+};
+
+class StatepointProcessor {
+private:
+  MachineFunction &MF;
+  const TargetRegisterInfo &TRI;
+  FrameIndexesCache CacheFI;
+
+public:
+  StatepointProcessor(MachineFunction &MF)
+      : MF(MF), TRI(*MF.getSubtarget().getRegisterInfo()),
+        CacheFI(MF.getFrameInfo(), TRI) {}
+
+  bool process(MachineInstr &MI) {
+    unsigned VarIdx = StatepointOpers(&MI).getVarIdx();
+    uint64_t Flags =
+        MI.getOperand(VarIdx + StatepointOpers::FlagsOffset).getImm();
+    // Do nothing for LiveIn; it supports all registers.
+    if (Flags & (uint64_t)StatepointFlags::DeoptLiveIn)
+      return false;
+    CallingConv::ID CC =
+        MI.getOperand(VarIdx + StatepointOpers::CCOffset).getImm();
+    const uint32_t *Mask = TRI.getCallPreservedMask(MF, CC);
+    CacheFI.reset();
+    StatepointState SS(MI, Mask, CacheFI);
+
+    if (!SS.findRegistersToSpill())
+      return false;
+
+    SS.spillRegisters();
+    SS.rewriteStatepoint();
+    return true;
+  }
+};
+
+bool FixupStatepointCallerSaved::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  const Function &F = MF.getFunction();
+  if (!F.hasGC())
+    return false;
+
+  SmallVector<MachineInstr *, 16> Statepoints;
+  for (MachineBasicBlock &BB : MF)
+    for (MachineInstr &I : BB)
+      if (I.getOpcode() == TargetOpcode::STATEPOINT)
+        Statepoints.push_back(&I);
+
+  if (Statepoints.empty())
+    return false;
+
+  bool Changed = false;
+  StatepointProcessor SPP(MF);
+  for (MachineInstr *I : Statepoints)
+    Changed |= SPP.process(*I);
+  return Changed;
+}

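Side note: the slot-reuse strategy implemented by FrameIndexesCache above
can be modeled in isolation. The standalone sketch below (SlotCache and its
members are illustrative names only, not LLVM API) shows how the
extend-slot-size mode trades growing old slots for allocating fewer of
them: spilling sizes {8, 4} at one statepoint and {8, 8} at the next ends
up with two slots total, one of which is extended.

  #include <cstdio>
  #include <map>
  #include <vector>

  // Simplified model of the pass's FrameIndexesCache.
  struct SlotCache {
    bool ExtendSlotSize; // mirrors -fixup-scs-extend-slot-size
    std::map<unsigned, std::vector<unsigned>> Slots; // bucket -> slot sizes
    std::map<unsigned, unsigned> Index; // bucket -> next unused slot
    unsigned NumAllocated = 0;

    // Make all cached slots available again (for the next statepoint).
    void reset() {
      for (auto &I : Index)
        I.second = 0;
    }

    // Returns the size the chosen slot ends up with.
    unsigned getSlot(unsigned RegSize) {
      unsigned Bucket = ExtendSlotSize ? 0 : RegSize;
      auto &S = Slots[Bucket];
      auto &Idx = Index[Bucket];
      if (Idx < S.size()) {
        unsigned &SlotSize = S[Idx++];
        if (SlotSize < RegSize) // grow an old slot instead of adding one
          SlotSize = RegSize;
        return SlotSize;
      }
      S.push_back(RegSize);
      ++Idx;
      ++NumAllocated;
      return RegSize;
    }
  };

  int main() {
    SlotCache C{true};
    for (unsigned Size : {8, 4}) // sorted descending, as sortRegisters does
      C.getSlot(Size);
    C.reset();                   // next statepoint reuses the same slots
    for (unsigned Size : {8, 8})
      C.getSlot(Size);
    std::printf("slots allocated: %u\n", C.NumAllocated); // prints 2, not 4
  }
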
diff  --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index ffe35637af28..5c26b11397b9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -61,6 +61,10 @@ STATISTIC(NumOfStatepoints, "Number of statepoint nodes encountered");
 STATISTIC(StatepointMaxSlotsRequired,
           "Maximum number of stack slots required for a singe statepoint");
 
+cl::opt<bool> UseRegistersForDeoptValues(
+    "use-registers-for-deopt-values", cl::Hidden, cl::init(false),
+    cl::desc("Allow using registers for non pointer deopt args"));
+
 static void pushStackMapConstant(SmallVectorImpl<SDValue>& Ops,
                                  SelectionDAGBuilder &Builder, uint64_t Value) {
   SDLoc L = Builder.getCurSDLoc();
@@ -409,7 +413,8 @@ lowerIncomingStatepointValue(SDValue Incoming, bool RequireSpillSlot,
     // end up folding some of these into stack references, but they'll be
     // handled by the register allocator.  Note that we do not have the notion
     // of a late use so these values might be placed in registers which are
-    // clobbered by the call.  This is fine for live-in.
+    // clobbered by the call.  This is fine for live-in. For live-through
+    // values, the fix-up pass must be run to force spilling of such registers.
     Ops.push_back(Incoming);
   } else {
     // Otherwise, locate a spill slot and explicitly spill it so it
@@ -494,7 +499,7 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
   };
 
   auto requireSpillSlot = [&](const Value *V) {
-    return !LiveInDeopt || isGCValue(V);
+    return !(LiveInDeopt || UseRegistersForDeoptValues) || isGCValue(V);
   };
 
   // Before we actually start lowering (and allocating spill slots for values),

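For reference, the updated requireSpillSlot predicate reduces to a small
truth function; the hypothetical free function below just restates the
lambda's logic:

  // A deopt value may stay in a register only when deopt args are live-in
  // or the new -use-registers-for-deopt-values option is enabled, and only
  // if the value is not a GC value (GC values are always spilled).
  bool requireSpillSlot(bool LiveInDeopt, bool UseRegistersForDeoptValues,
                        bool IsGCValue) {
    return !(LiveInDeopt || UseRegistersForDeoptValues) || IsGCValue;
  }
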
diff  --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index a351efc6659d..7463caf2fd00 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -204,6 +204,8 @@ static cl::opt<std::string>
                   cl::desc("Stop compilation before a specific pass"),
                   cl::value_desc("pass-name"), cl::init(""), cl::Hidden);
 
+extern cl::opt<bool> UseRegistersForDeoptValues;
+
 /// Allow standard passes to be disabled by command line options. This supports
 /// simple binary flags that either suppress the pass or do nothing.
 /// i.e. -disable-mypass=false has no effect.
@@ -911,6 +913,9 @@ void TargetPassConfig::addMachinePasses() {
   // Run post-ra passes.
   addPostRegAlloc();
 
+  if (UseRegistersForDeoptValues)
+    addPass(&FixupStatepointCallerSavedID);
+
   // Insert prolog/epilog code.  Eliminate abstract frame index references...
   if (getOptLevel() != CodeGenOpt::None) {
     addPass(&PostRAMachineSinkingID);

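Since UseRegistersForDeoptValues is a cl::opt, an out-of-tree driver that
links LLVM can enable it the same way llc does, through the command-line
machinery. A minimal sketch (assuming the usual cl::ParseCommandLineOptions
entry point; the rest of the codegen setup is elided):

  #include "llvm/Support/CommandLine.h"

  int main(int argc, char **argv) {
    (void)argc;
    // Forwarding the flag enables register lowering for non-GC deopt
    // values and, with it, the FixupStatepointCallerSaved pass.
    const char *Args[] = {argv[0], "-use-registers-for-deopt-values"};
    llvm::cl::ParseCommandLineOptions(2, Args);
    // ... build the TargetMachine and run codegen as usual ...
    return 0;
  }
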
diff  --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll
new file mode 100644
index 000000000000..e3333bd51ab7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/statepoint-regs.ll
@@ -0,0 +1,679 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+declare void @bar() #0
+declare void @baz()
+
+; Spill a caller-saved register for %a.
+define void @test1(i32 %a) gc "statepoint-example" {
+; CHECK-LABEL: test1:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    callq _bar ## 4-byte Folded Reload
+; CHECK-NEXT:  Ltmp0:
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+entry:
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 1, i32 %a)
+  ret void
+}
+
+; Callee-saved registers are ok.
+define void @test2(i32 %a, i32 %b) gc "statepoint-example" {
+; CHECK-LABEL: test2:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset %rbx, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    callq _bar
+; CHECK-NEXT:  Ltmp1:
+; CHECK-NEXT:    callq _bar
+; CHECK-NEXT:  Ltmp2:
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+entry:
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 2, i32 %a, i32 %b)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 2, i32 %b, i32 %a)
+  ret void
+}
+
+; Arguments are in caller-saved registers, so they must be spilled.
+define void @test3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i) gc "statepoint-example" {
+; CHECK-LABEL: test3:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %r9d, (%rsp) ## 4-byte Spill
+; CHECK-NEXT:    callq _bar ## 24-byte Folded Reload
+; CHECK-NEXT:  Ltmp3:
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    retq
+entry:
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
+  ret void
+}
+
+; This case just confirms that we don't crash when given more live values
+; than registers.  This is a case where we *have* to use a stack slot.  This
+; also ends up being a good test of whether we can fold loads from immutable
+; stack slots into the statepoint.
+define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
+; CHECK-LABEL: test4:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %r9d, (%rsp) ## 4-byte Spill
+; CHECK-NEXT:    callq _bar ## 24-byte Folded Reload
+; CHECK-NEXT:  Ltmp4:
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    retq
+entry:
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
+  ret void
+}
+
+; A gc-value must be spilled even if it is also a deopt value.
+define  i32 addrspace(1)* @test5(i32 %a, i32 addrspace(1)* %p) gc "statepoint-example" {
+; CHECK-LABEL: test5:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    subq $16, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    callq _bar
+; CHECK-NEXT:  Ltmp5:
+; CHECK-NEXT:    callq _bar
+; CHECK-NEXT:  Ltmp6:
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    addq $16, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+entry:
+  %token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 1, i32 %a, i32 addrspace(1)* %p, i32 addrspace(1)* %p)
+  %p2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token,  i32 9, i32 9)
+  %token2 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 1, i32 %a, i32 addrspace(1)* %p2, i32 addrspace(1)* %p2)
+  %p3 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token2,  i32 9, i32 9)
+  ret i32 addrspace(1)* %p3
+}
+
+; Callee-saved registers are ok again.
+define void @test6(i32 %a) gc "statepoint-example" {
+; CHECK-LABEL: test6:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    callq _baz
+; CHECK-NEXT:  Ltmp7:
+; CHECK-NEXT:    callq _bar
+; CHECK-NEXT:  Ltmp8:
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+entry:
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 1, i32 %a)
+  ret void
+}
+
+; Many deopt values.
+define void @test7(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
+; The code for this is terrible, check simply for correctness for the moment
+; CHECK-LABEL: test7:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    subq $168, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 224
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl %edi, %edi
+; CHECK-NEXT:    movl %esi, %esi
+; CHECK-NEXT:    movl %edx, %edx
+; CHECK-NEXT:    movl %ecx, %ecx
+; CHECK-NEXT:    movl %r8d, %r8d
+; CHECK-NEXT:    movl %r9d, %r9d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    callq _bar ## 160-byte Folded Reload
+; CHECK-NEXT:  Ltmp9:
+; CHECK-NEXT:    addq $168, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+entry:
+  %a64 = zext i32 %a to i64
+  %b64 = zext i32 %b to i64
+  %c64 = zext i32 %c to i64
+  %d64 = zext i32 %d to i64
+  %e64 = zext i32 %e to i64
+  %f64 = zext i32 %f to i64
+  %g64 = zext i32 %g to i64
+  %h64 = zext i32 %h to i64
+  %i64 = zext i32 %i to i64
+  %j64 = zext i32 %j to i64
+  %k64 = zext i32 %k to i64
+  %l64 = zext i32 %l to i64
+  %m64 = zext i32 %m to i64
+  %n64 = zext i32 %n to i64
+  %o64 = zext i32 %o to i64
+  %p64 = zext i32 %p to i64
+  %q64 = zext i32 %q to i64
+  %r64 = zext i32 %r to i64
+  %s64 = zext i32 %s to i64
+  %t64 = zext i32 %t to i64
+  %u64 = zext i32 %u to i64
+  %v64 = zext i32 %v to i64
+  %w64 = zext i32 %w to i64
+  %x64 = zext i32 %x to i64
+  %y64 = zext i32 %y to i64
+  %z64 = zext i32 %z to i64
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)
+  ret void
+}
+
+; a variant of test7 with mixed types chosen to exercise register aliases
+define void @test8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
+; The code for this is terrible, check simply for correctness for the moment
+; CHECK-LABEL: test8:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    subq $104, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 160
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %r9d, %r10d
+; CHECK-NEXT:    movl %r8d, %r9d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r8d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Spill
+; CHECK-NEXT:    movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Spill
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Spill
+; CHECK-NEXT:    movb %cl, (%rsp) ## 1-byte Spill
+; CHECK-NEXT:    movw %r9w, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    movw %r10w, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    callq _bar ## 104-byte Folded Reload
+; CHECK-NEXT:  Ltmp10:
+; CHECK-NEXT:    addq $104, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+entry:
+  %a8 = trunc i32 %a to i8
+  %b8 = trunc i32 %b to i8
+  %c8 = trunc i32 %c to i8
+  %d8 = trunc i32 %d to i8
+  %e16 = trunc i32 %e to i16
+  %f16 = trunc i32 %f to i16
+  %g16 = trunc i32 %g to i16
+  %h16 = trunc i32 %h to i16
+  %i64 = zext i32 %i to i64
+  %j64 = zext i32 %j to i64
+  %k64 = zext i32 %k to i64
+  %l64 = zext i32 %l to i64
+  %m64 = zext i32 %m to i64
+  %n64 = zext i32 %n to i64
+  %o64 = zext i32 %o to i64
+  %p64 = zext i32 %p to i64
+  %q64 = zext i32 %q to i64
+  %r64 = zext i32 %r to i64
+  %s64 = zext i32 %s to i64
+  %t64 = zext i32 %t to i64
+  %u64 = zext i32 %u to i64
+  %v64 = zext i32 %v to i64
+  %w64 = zext i32 %w to i64
+  %x64 = zext i32 %x to i64
+  %y64 = zext i32 %y to i64
+  %z64 = zext i32 %z to i64
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i8 %a8, i8 %b8, i8 %c8, i8 %d8, i16 %e16, i16 %f16, i16 %g16, i16 %h16, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)
+  ret void
+}
+
+; Test perfect forwarding of argument registers and stack slots to the
+; deopt bundle uses
+define void @test9(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
+; CHECK-LABEL: test9:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %r9d, (%rsp) ## 4-byte Spill
+; CHECK-NEXT:    callq _bar ## 24-byte Folded Reload
+; CHECK-NEXT:  Ltmp11:
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    retq
+
+entry:
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
+  ret void
+}
+
+; Test folding of argument slots when one call clobbers registers before a
+; second call needs them - i.e. we must do something with arguments
+; originally passed in registers.
+define void @test10(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
+; FIXME (minor): It would be better to just spill (and fold the reload) for
+; argument registers than to spill and fill all the CSRs.
+; CHECK-LABEL: test10:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %r9d, %r15d
+; CHECK-NEXT:    movl %r8d, %r14d
+; CHECK-NEXT:    movl %ecx, %r12d
+; CHECK-NEXT:    movl %edx, %r13d
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    callq _bar
+; CHECK-NEXT:  Ltmp12:
+; CHECK-NEXT:    callq _bar
+; CHECK-NEXT:  Ltmp13:
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+
+entry:
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
+  %statepoint_token2 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
+  ret void
+}
+
+; Check that we can remat some uses of a def despite not remating before the
+; statepoint user.
+define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
+; FIXME: The codegen for this is correct, but horrible.  Lots of room for
+; improvement if we so desire.
+; CHECK-LABEL: test11:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    subq $168, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 224
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edx, %r12d
+; CHECK-NEXT:    movl %ecx, %r13d
+; CHECK-NEXT:    movl %r8d, %ebp
+; CHECK-NEXT:    movl %r9d, %r14d
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    callq _bar ## 160-byte Folded Reload
+; CHECK-NEXT:  Ltmp14:
+; CHECK-NEXT:    addq %r15, %rbx
+; CHECK-NEXT:    addq %r12, %rbx
+; CHECK-NEXT:    addq %r13, %rbx
+; CHECK-NEXT:    addq %rbp, %rbx
+; CHECK-NEXT:    addq %r14, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    movq %rbx, %rax
+; CHECK-NEXT:    addq $168, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+
+entry:
+  %a64 = zext i32 %a to i64
+  %b64 = zext i32 %b to i64
+  %c64 = zext i32 %c to i64
+  %d64 = zext i32 %d to i64
+  %e64 = zext i32 %e to i64
+  %f64 = zext i32 %f to i64
+  %g64 = zext i32 %g to i64
+  %h64 = zext i32 %h to i64
+  %i64 = zext i32 %i to i64
+  %j64 = zext i32 %j to i64
+  %k64 = zext i32 %k to i64
+  %l64 = zext i32 %l to i64
+  %m64 = zext i32 %m to i64
+  %n64 = zext i32 %n to i64
+  %o64 = zext i32 %o to i64
+  %p64 = zext i32 %p to i64
+  %q64 = zext i32 %q to i64
+  %r64 = zext i32 %r to i64
+  %s64 = zext i32 %s to i64
+  %t64 = zext i32 %t to i64
+  %u64 = zext i32 %u to i64
+  %v64 = zext i32 %v to i64
+  %w64 = zext i32 %w to i64
+  %x64 = zext i32 %x to i64
+  %y64 = zext i32 %y to i64
+  %z64 = zext i32 %z to i64
+  call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 26, i64 %a64, i64 %b64, i64 %c64, i64 %d64, i64 %e64, i64 %f64, i64 %g64, i64 %h64, i64 %i64, i64 %j64, i64 %k64, i64 %l64, i64 %m64, i64 %n64, i64 %o64, i64 %p64, i64 %q64, i64 %r64, i64 %s64, i64 %t64, i64 %u64, i64 %v64, i64 %w64, i64 %x64, i64 %y64, i64 %z64)
+  %addab = add i64 %a64, %b64
+  %addc = add i64 %addab, %c64
+  %addd = add i64 %addc, %d64
+  %adde = add i64 %addd, %e64
+  %addf = add i64 %adde, %f64
+  %addg = add i64 %addf, %g64
+  %addh = add i64 %addg, %h64
+  %addi = add i64 %addh, %i64
+  %addj = add i64 %addi, %j64
+  %addk = add i64 %addj, %k64
+  %addl = add i64 %addk, %l64
+  %addm = add i64 %addl, %m64
+  %addn = add i64 %addm, %n64
+  %addo = add i64 %addn, %o64
+  %addp = add i64 %addo, %p64
+  %addq = add i64 %addp, %q64
+  %addr = add i64 %addq, %r64
+  %adds = add i64 %addr, %s64
+  %addt = add i64 %adds, %t64
+  %addu = add i64 %addt, %u64
+  %addv = add i64 %addu, %v64
+  %addw = add i64 %addv, %w64
+  %addx = add i64 %addw, %x64
+  %addy = add i64 %addx, %y64
+  %addz = add i64 %addy, %z64
+  ret i64 %addz
+}
+
+; Demonstrate taking the address of a function (with spilling because a caller-saved register is used).
+define void @addr_func() gc "statepoint-example" {
+; CHECK-LABEL: addr_func:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movq _bar@{{.*}}(%rip), %rax
+; CHECK-NEXT:    movq %rax, (%rsp) ## 8-byte Spill
+; CHECK-NEXT:    callq _bar ## 8-byte Folded Reload
+; CHECK-NEXT:  Ltmp15:
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+entry:
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 3, void ()* @bar, void ()* @bar, void ()* @bar)
+  ret void
+}
+
+; Demonstrate taking the address of a global (with spilling because a caller-saved register is used).
+@G = external global i32
+define void @addr_global() gc "statepoint-example" {
+; CHECK-LABEL: addr_global:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movq _G@{{.*}}(%rip), %rax
+; CHECK-NEXT:    movq %rax, (%rsp) ## 8-byte Spill
+; CHECK-NEXT:    callq _bar ## 8-byte Folded Reload
+; CHECK-NEXT:  Ltmp16:
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+entry:
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 3, i32* @G, i32* @G, i32* @G)
+  ret void
+}
+
+define void @addr_alloca(i32 %v) gc "statepoint-example" {
+; CHECK-LABEL: addr_alloca:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    callq _bar
+; CHECK-NEXT:  Ltmp17:
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+entry:
+  %a = alloca i32
+  store i32 %v, i32* %a
+  %statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 0, i64 0, i64 3, i32* %a, i32* %a, i32* %a)
+  ret void
+}
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)
+
+attributes #0 = { "deopt-lowering"="live-in" }
+attributes #1 = { "deopt-lowering"="live-through" }
