[llvm-commits] CVS: llvm/lib/Reoptimizer/Inst/InstManip.cpp InstManip.h Phases.cpp design.txt

Joel Stanley jstanley at cs.uiuc.edu
Tue Apr 29 13:31:01 PDT 2003


Changes in directory llvm/lib/Reoptimizer/Inst:

InstManip.cpp updated: 1.7 -> 1.8
InstManip.h updated: 1.8 -> 1.9
Phases.cpp updated: 1.12 -> 1.13
design.txt updated: 1.9 -> 1.10

---
Log message:

Phase3-generated phase 4 slots now spill global registers properly.


---
Diffs of the changes:

Index: llvm/lib/Reoptimizer/Inst/InstManip.cpp
diff -u llvm/lib/Reoptimizer/Inst/InstManip.cpp:1.7 llvm/lib/Reoptimizer/Inst/InstManip.cpp:1.8
--- llvm/lib/Reoptimizer/Inst/InstManip.cpp:1.7	Tue Apr 15 16:26:19 2003
+++ llvm/lib/Reoptimizer/Inst/InstManip.cpp	Tue Apr 29 13:36:53 2003
@@ -12,6 +12,7 @@
 #include "InstManip.h"
 
 const unsigned InstManip::NOP_INST = 0x01000000;
+uint64_t InstManip::sm_phase3SpillRegion[InstManip::SHARED_SIZE];
 
 using std::cout;
 using std::cerr;
@@ -154,16 +155,77 @@
            "Unexpected number of instructions in code sequence for call");
 }
 
+// NB: generateSpillShared/generateRestoreShared currently fill the snippet (which comes
+// from a slot) with code to save and restore the global registers.  This blows up the
+// required slot size quite a bit -- it would be better to generate calls to functions
+// saveGlobalRegs() and restoreGlobalRegs(), for example.  However, this works for now,
+// and writing those functions means determining what the inline assembly should look like.
+// The ifdef'd-out region below is a start, but it is incomplete and generates errors at
+// assembly time. In particular, the SPARC assembler requires a '.register' directive before
+// it witnesses a use of %g2, %g3, %g6, or %g7, and that doesn't appear to be emitted simply
+// by using the inline assembly. :( TODO.
+//
+
+#if 0
+void restoreGlobRegs()
+{
+    // asm ("assembly template" : "output constraints" : "input constraints")
+    // Restore the global registers %g[1-7] from the globalRegs array.
+    
+    asm("ldx %0, %%g1"::"o" (globalRegs));
+    asm("ldx %0, %%g2"::"o" (globalRegs+1));
+    asm("ldx %0, %%g3"::"o" (globalRegs+2));
+    asm("ldx %0, %%g4"::"o" (globalRegs+3));
+    asm("ldx %0, %%g5"::"o" (globalRegs+4));
+    asm("ldx %0, %%g6"::"o" (globalRegs+5));
+    asm("ldx %0, %%g7"::"o" (globalRegs+6));
+}
+#endif
+
+void InstManip::generateRestoreShared(uint64_t restoreFromAddr,
+                                      std::vector<unsigned>& snippet,
+                                      TargetRegister reg) const 
+{
+    generateLoad(restoreFromAddr, snippet, reg);
+
+    unsigned destReg = (reg == REG_0) ? R_O0 : R_O1;
+    
+    snippet.push_back(MK_LOAD_IMM(R_G1, destReg, 0));
+    snippet.push_back(MK_LOAD_IMM(R_G2, destReg, 8));
+    snippet.push_back(MK_LOAD_IMM(R_G3, destReg, 16));
+    snippet.push_back(MK_LOAD_IMM(R_G4, destReg, 24));
+    snippet.push_back(MK_LOAD_IMM(R_G5, destReg, 32));
+    snippet.push_back(MK_LOAD_IMM(R_G6, destReg, 40));
+    snippet.push_back(MK_LOAD_IMM(R_G7, destReg, 48));
+}
+
 void InstManip::generateRestore(std::vector<unsigned>& snippet) const
 {
     // restore %o0, 0, %o0
-    snippet.push_back(MK_RESTORE(R_O0, R_O0, 0));
+    snippet.push_back(MK_RESTORE_IMM(R_O0, R_O0, 0));
+}
+
+void InstManip::generateSpillShared(uint64_t spillToAddr,
+                                    std::vector<unsigned>& snippet,
+                                    TargetRegister reg) const 
+{
+    generateLoad(spillToAddr, snippet, reg);
+
+    unsigned destReg = (reg == REG_0) ? R_O0 : R_O1;
+
+    snippet.push_back(MK_STORE_IMM(R_G1, destReg, 0));
+    snippet.push_back(MK_STORE_IMM(R_G2, destReg, 8));
+    snippet.push_back(MK_STORE_IMM(R_G3, destReg, 16));
+    snippet.push_back(MK_STORE_IMM(R_G4, destReg, 24));
+    snippet.push_back(MK_STORE_IMM(R_G5, destReg, 32));
+    snippet.push_back(MK_STORE_IMM(R_G6, destReg, 40));
+    snippet.push_back(MK_STORE_IMM(R_G7, destReg, 48));
 }
 
 void InstManip::generateSave(std::vector<unsigned>& snippet) const
 {
     // save %o0, 0, %o0
-    snippet.push_back(MK_SAVE(R_O0, R_O0, 0));
+    snippet.push_back(MK_SAVE_IMM(R_O0, R_O0, 0));
 }
 
 void InstManip::generateBranchAlways(uint64_t dest,

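For reference, a sketch of what the completed save/restore helpers might look like once
the '.register' problem is solved. This is an untested sketch, not part of the commit:
the globalRegs array, the '#scratch' annotations, and the memory constraints are
assumptions made for illustration.

    #include <stdint.h>

    extern uint64_t globalRegs[7];   // hypothetical spill area

    // Top-level asm to satisfy the assembler's '.register' requirement for the
    // application-reserved globals before any use of %g2/%g3/%g6/%g7.
    asm(".register %g2, #scratch\n"
        ".register %g3, #scratch\n"
        ".register %g6, #scratch\n"
        ".register %g7, #scratch");

    void saveGlobRegs()
    {
        // Store the global registers %g[1-7] into the globalRegs array.
        asm("stx %%g1, %0" : "=o" (globalRegs[0]));
        asm("stx %%g2, %0" : "=o" (globalRegs[1]));
        asm("stx %%g3, %0" : "=o" (globalRegs[2]));
        asm("stx %%g4, %0" : "=o" (globalRegs[3]));
        asm("stx %%g5, %0" : "=o" (globalRegs[4]));
        asm("stx %%g6, %0" : "=o" (globalRegs[5]));
        asm("stx %%g7, %0" : "=o" (globalRegs[6]));
    }

    void restoreGlobRegs()
    {
        // Reload the global registers %g[1-7] from the globalRegs array.
        asm("ldx %0, %%g1" :: "o" (globalRegs[0]));
        asm("ldx %0, %%g2" :: "o" (globalRegs[1]));
        asm("ldx %0, %%g3" :: "o" (globalRegs[2]));
        asm("ldx %0, %%g4" :: "o" (globalRegs[3]));
        asm("ldx %0, %%g5" :: "o" (globalRegs[4]));
        asm("ldx %0, %%g6" :: "o" (globalRegs[5]));
        asm("ldx %0, %%g7" :: "o" (globalRegs[6]));
    }

With helpers of this shape, generateSpillShared/generateRestoreShared would shrink to the
address load plus a single call, instead of emitting one store or load per register.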

Index: llvm/lib/Reoptimizer/Inst/InstManip.h
diff -u llvm/lib/Reoptimizer/Inst/InstManip.h:1.8 llvm/lib/Reoptimizer/Inst/InstManip.h:1.9
--- llvm/lib/Reoptimizer/Inst/InstManip.h:1.8	Fri Apr 18 12:29:00 2003
+++ llvm/lib/Reoptimizer/Inst/InstManip.h	Tue Apr 29 13:36:53 2003
@@ -43,6 +43,11 @@
         return m_insts;
     }
 
+    const std::vector<std::pair<uint64_t, unsigned> >& getInsts() const
+    {
+        return m_insts;
+    }
+
     void push_back(uint64_t addr, unsigned inst) 
     {
         m_insts.push_back(std::make_pair(addr, inst));
@@ -103,7 +108,15 @@
 
     void            generateRestore(std::vector<unsigned>& snippet) const;
     void            generateSave(std::vector<unsigned>& snippet) const;
+
+    void            generateSpillShared(uint64_t spillToAddr,
+                                        std::vector<unsigned>& snippet,
+                                        TargetRegister reg = REG_0) const;
     
+    void            generateRestoreShared(uint64_t restoreFromAddr,
+                                          std::vector<unsigned>& snippet,
+                                          TargetRegister reg = REG_0) const;
+
     void            generateBranchAlways(uint64_t dest,
                                          uint64_t slotBase,
                                          std::vector<unsigned>& snippet,
@@ -120,16 +133,21 @@
     // These are functions so when InstManip is superclassed, they'd become virtual, etc.
     // In the short term we could use class constants, but this is more clear.
     
-    unsigned        getNOP() const                 { return NOP_INST; }
-    unsigned        getGenLoadSize() const         { return 6;        }
-    unsigned        getGenCallSize() const         { return 2;        }
-    unsigned        getGenBranchAlwaysSize() const { return 2;        }
-    unsigned        getGenSaveSize() const         { return 1;        }
-    unsigned        getGenRestoreSize() const      { return 1;        }
-    unsigned        getInstWidth() const           { return 4;        }
+    unsigned        getNOP() const                   { return NOP_INST;                       }
+    unsigned        getGenLoadSize() const           { return 6;                              }
+    unsigned        getGenCallSize() const           { return 2;                              }
+    unsigned        getGenBranchAlwaysSize() const   { return 2;                              }
+    unsigned        getGenSaveSize() const           { return 1;                              }
+    unsigned        getGenSpillSharedSize() const    { return getGenLoadSize() + SHARED_SIZE; }
+    unsigned        getGenRestoreSharedSize() const  { return getGenLoadSize() + SHARED_SIZE; }
+    unsigned        getGenRestoreSize() const        { return 1;                              }
+    unsigned        getInstWidth() const             { return 4;                              }
+    unsigned        getSharedSize() const            { return SHARED_SIZE;                    }
 
     inline unsigned getAddressCopySize(unsigned loadInst) const;
 
+    uint64_t getPhase3SpillAddr() { return (uint64_t) sm_phase3SpillRegion; }
+
   private:
     InstManip() {}
 
@@ -154,7 +172,18 @@
     static const unsigned BRANCH_ALWAYS_BASE = 0x30480000;
     static const unsigned NOP_INST;
 
+    // Size (in number of 64-bit words) required for storing shared registers
+    static const unsigned SHARED_SIZE = 7;
+
     VirtualMem* m_pVM;
+    
+    // Memory region into which to spill shared registers when executing a phase 4 slot
+    // (i.e., the slot that invokes the phase4 function, the slot written by phase 3
+    // invocations).  NB: One region is sufficient and we do not need stack semantics
+    // because only one activation of a phase 4 slot ever occurs at a given time (assuming
+    // single-threaded execution).
+
+    static uint64_t sm_phase3SpillRegion[SHARED_SIZE];
 };
 
 void InstManip::printRange(uint64_t start, uint64_t end) const
@@ -208,6 +237,5 @@
 
     return 1;
 }
-
 
 #endif // _INCLUDED_INSTMANIP_H
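To see how the new size accessors combine, here is a sketch (not from the commit;
phase3SlotInsts is a hypothetical helper that mirrors Phase3::getSlotSize in Phases.cpp
below):

    #include "InstManip.h"

    // With getGenLoadSize() == 6 and SHARED_SIZE == 7, each spill/restore of the
    // shared registers costs 6 + 7 = 13 instructions, so a phase 3 slot holds
    // 1 + N + 13 + 6 + 2 + 13 + 1 + 2 = 38 + N instructions, where N is the
    // address-copy size for the candidate load.
    unsigned phase3SlotInsts(const InstManip& im, unsigned addrCopySize)
    {
        return im.getGenSaveSize()             // save            (1)
             + addrCopySize                    // address copy    (N)
             + im.getGenSpillSharedSize()      // spill globals   (13)
             + im.getGenLoadSize()             // load p4info ptr (6)
             + im.getGenCallSize()             // call phase4     (2)
             + im.getGenRestoreSharedSize()    // restore globals (13)
             + im.getGenRestoreSize()          // restore         (1)
             + im.getGenBranchAlwaysSize();    // branch + nop    (2)
    }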


Index: llvm/lib/Reoptimizer/Inst/Phases.cpp
diff -u llvm/lib/Reoptimizer/Inst/Phases.cpp:1.12 llvm/lib/Reoptimizer/Inst/Phases.cpp:1.13
--- llvm/lib/Reoptimizer/Inst/Phases.cpp:1.12	Fri Apr 18 12:29:00 2003
+++ llvm/lib/Reoptimizer/Inst/Phases.cpp	Tue Apr 29 13:36:53 2003
@@ -43,11 +43,13 @@
 //         3c. In the new slot, write the contents of the phase 3 slot:
 //                  +---------------------------------------+
 //                  |             save registers            |
+//                  |           save global registers       |
 //                  | copy load-src addr to param1 register |
 //                  | load p4 struct ptr to param2 register |
 //                  |             call to phase 4           |
 //                  |                  nop                  |
 //                  |             restore registers         |
+//                  |        restore global registers       |
 //                  |          branch back to orig code     |
 //                  |                  nop                  |
 //                  +---------------------------------------+
@@ -56,6 +58,26 @@
 //
 // PHASE 4:
 //
+//      1. Examine the tag (i.e. load-src addr) passed by phase 3
+//        1a. If tag is in GBT, we have a valid candidate, so do step 2.
+//        1b. If tag is not in GBT, our candidate is invalid, so delete slot and return to
+//        original code.
+//        
+//      2. Set up the second phase 4 slot that will actually call the instrumentation function:
+//                  +---------------------------------------+
+//                  |             save registers            |
+//                  |           save global registers       |
+//                  |           call to inst func           |
+//                  |                  nop                  |
+//                  |             restore registers         |
+//                  |        restore global registers       |
+//                  |          branch back to orig code     |
+//                  |                  nop                  |
+//                  +---------------------------------------+
+//      This "instrumentation slot" may have to be expanded later to store the return value
+//      in an alloca'd temporary, unless the phase4 function itself can invoke the
+//      instrumentation function, which would be *highly* desirable.
+//
 
 #include <stdlib.h>
 #include <iostream>
@@ -79,11 +101,13 @@
 // obtained in the same manner.
 
 extern unsigned ppGBTSize;
-extern struct PrimInfo {
+struct PrimInfo {
     unsigned gbtType;
     unsigned short* loadVar;
     unsigned gbtStartIdx;
-} ppGBT[];
+};
+
+extern PrimInfo ppGBT[];
 
 typedef std::pair<uint64_t, uint64_t> AddressRange;
 
@@ -184,8 +208,8 @@
 };
 
 // Phase3 is the class that is responsible for making the "phase 3" transformation; the
-// global function phase3() is responsible for constructing a one Phase3 instance per
-// invocation and for deallocating the originating slot.
+// global function phase3() is responsible for constructing one Phase3 instance per
+// invocation and invoking transform on it.
 
 class Phase3 
 {
@@ -206,6 +230,28 @@
     InstManip   m_instManip;
 };
 
+// Phase4 is the class that is responsible for making the "phase 4" transformation; the
+// global function phase4() is responsible for constructing one Phase4 instance per
+// invocation and invoking transform on it.
+
+class Phase4
+{
+  public:
+    Phase4(uint64_t tag, Phase4Info* p4info);
+    ~Phase4();
+
+    void transform();
+
+  private:
+    Phase4(): m_instManip(0) {}
+
+    inline unsigned getSlotSize() const;
+
+    Phase4Info* m_pPhase4Info;
+    TraceCache* m_pTraceCache;
+    InstManip   m_instManip;
+    uint64_t    m_tag;         // Entry to look for in the GBT
+};
 
 //////////////// Phase 2 implementation ////////////////
 
@@ -250,12 +296,12 @@
 static void copySnippetToSlot(vector<unsigned>& snippet,
                               uint64_t slotBase,
                               VirtualMem* vm,
-                              InstManip* im) 
+                              InstManip& im) 
 {
     uint64_t currAddr = slotBase;
     for(vector<unsigned>::iterator i = snippet.begin(), e = snippet.end(); i != e; ++i) {
         vm->writeInstToVM(currAddr, *i);
-        currAddr += im->getInstWidth();
+        currAddr += im.getInstWidth();
     }
 }
 
@@ -289,7 +335,7 @@
 
     // Copy the snippet code into the slot
     assert(snippet.size() == getSlotSize() && "Snippet size does not match slot size");
-    copySnippetToSlot(snippet, slotBase, vm, &m_instManip);
+    copySnippetToSlot(snippet, slotBase, vm, m_instManip);
 }
 
 unsigned Phase2::getSlotSize() const
@@ -344,6 +390,21 @@
     delete m_pPhase3Info;
 }
 
+static uint64_t replaceInstWithBrToSlot(uint64_t srcAddr,
+                                        unsigned slotSize,
+                                        TraceCache* tc,
+                                        InstManip& im) 
+{
+    // Obtain a new slot of the given size
+    uint64_t slotBase = tc->getMemMgr()->getMemory(slotSize);
+    assert(slotBase && "Unable to obtain memory from MemoryManager instance");
+
+    // Replace instruction at srcAddr with branch to start of new slot
+    tc->getVM()->writeInstToVM(srcAddr, im.getBranchAlways(slotBase, srcAddr));
+
+    return slotBase;
+}
+
 void Phase3::processCandidates(vector<InstCandidate>& candidates) 
 {
     // For each load candidate, obtain a new slot and write the phase 3 slot region
@@ -352,6 +413,7 @@
     for(vector<InstCandidate>::iterator i = candidates.begin(), e = candidates.end(); i != e; ++i) {
         cerr << "Transforming " << *i << endl;
 
+#if 0
         uint64_t slotBase = m_pTraceCache->getMemMgr()->getMemory(getSlotSize(*i));
         assert(slotBase && "Unable to obtain memory from MemoryManger instance");
 
@@ -359,6 +421,10 @@
         VirtualMem* vm = m_pTraceCache->getVM();
         uint64_t loadAddr = i->front().first;
         vm->writeInstToVM(loadAddr, m_instManip.getBranchAlways(slotBase, loadAddr));
+#endif
+        // Replace load candidate instruction with a branch to the start of a new slot.
+        uint64_t slotBase = replaceInstWithBrToSlot(i->front().first, getSlotSize(*i),
+                                                    m_pTraceCache, m_instManip);
 
         // Generate a) code to save the registers, b) instruction(s) to store the load
         // source address into a phase4 parameter register, c) the load of (the
@@ -368,11 +434,15 @@
 
         Phase4Info* p4info = new Phase4Info(*i, slotBase, getSlotSize(*i), m_pTraceCache);
 
+        uint64_t spillAddr = m_instManip.getPhase3SpillAddr();
+        
         vector<unsigned> snippet;
         m_instManip.generateSave(snippet);
-        m_instManip.generateAddressCopy(i->front().second, snippet);
+        m_instManip.generateAddressCopy(i->front().second, snippet); // Uses InstManip::REG_0, live to call
+        m_instManip.generateSpillShared(spillAddr, snippet, InstManip::REG_1);
         m_instManip.generateLoad((uint64_t) p4info, snippet, InstManip::REG_1);
         m_instManip.generateCall((uint64_t) &phase4, slotBase, snippet);
+        m_instManip.generateRestoreShared(spillAddr, snippet);
         m_instManip.generateRestore(snippet);
         m_instManip.generateBranchAlways(i->front().first, slotBase, snippet);
 
@@ -387,7 +457,7 @@
 
         // Copy the snippet code into the slot
         assert(snippet.size() == getSlotSize(*i) && "Snippet size does not match slot size");
-        copySnippetToSlot(snippet, slotBase, vm, &m_instManip);
+        copySnippetToSlot(snippet, slotBase, m_pTraceCache->getVM(), m_instManip);
 
         // just one candidate for now
         break;
@@ -401,8 +471,10 @@
 
     return m_instManip.getGenSaveSize() +
         m_instManip.getAddressCopySize(cand.front().second) +
+        m_instManip.getGenSpillSharedSize() +
         m_instManip.getGenLoadSize() +
         m_instManip.getGenCallSize() +
+        m_instManip.getGenRestoreSharedSize() +
         m_instManip.getGenRestoreSize() +
         m_instManip.getGenBranchAlwaysSize();
 }
@@ -422,33 +494,120 @@
 
 //////////////// Phase4 implementation ////////////////
 
-void phase4(uint64_t tag, Phase4Info* p4info) 
+void phase4(uint64_t tag, Phase4Info* p4info)
 {
-    cerr << "phase4 invoked!" << endl;
+    cerr << "phase 4 fcn, tag is " << tag << endl;
+    Phase4 p4(tag, p4info);
+    p4.transform();
+}
 
-    cerr << "tag is " << std::hex << tag << endl;
+Phase4::Phase4(uint64_t tag, Phase4Info* p4info):
+    m_pPhase4Info(p4info),
+    m_pTraceCache(p4info->getTraceCache()),
+    m_instManip(p4info->getTraceCache()->getVM()),
+    m_tag(tag)
+{
+    cerr << "phase4 ctor: tag is " << tag << endl;
+    cerr << "================ Begin Phase 4 ================\n";
+}
 
-    cerr << "inst candidate inside info structure is: " << endl;
-    cerr << p4info->getCandidate() << endl;
+Phase4::~Phase4() 
+{
+    // Deallocate the originating slot (i.e. the slot that invoked us).
+    // 
+    // NB: Yes, we are, in fact, deallocating a memory segment (i.e., the slot obtained by
+    // the TraceCache's MemoryManager instance) before returning to it. This is not a
+    // problem for single-threaded codes, because no other thread can claim that memory and
+    // write to it.  However, it does indeed pose a problem for multi-threaded codes.  A
+    // modification to the general mechanism itself is required to achieve thread-safety.
 
-    // (TEMP) For now, restore the candidate load to its original position for debugging
-    // purposes.
+    uint64_t slotBase = m_pPhase4Info->getSlot();
+    unsigned slotSize = m_pPhase4Info->getSlotSize();
+    m_pTraceCache->getMemMgr()->freeTraceMemory(slotBase, slotSize);
 
-    p4info->getTraceCache()->getVM()->writeInstToVM(p4info->getCandidate().front().first,
-                                                    p4info->getCandidate().front().second);
-    delete p4info;
+    // Deallocate the parameter structure
+    delete m_pPhase4Info;
+}
 
-    cerr << "ppGBT is: " << ppGBT << endl;
-    cerr << "ppGBTSize is: " << ppGBTSize << endl;
+static void dumpGBT(std::ostream& ostr) 
+{
+    ostr << "ppGBT is: " << ppGBT << endl;
+    ostr << "ppGBTSize is: " << ppGBTSize << endl;
 
-    for(int i = 0; i < ppGBTSize; ++i) {
-        cerr << "ppGBT[" << i << "]: " << ppGBT[i].gbtType << ", "
+    for(unsigned i = 0; i < ppGBTSize; ++i) {
+        ostr << "ppGBT[" << i << "]: " << ppGBT[i].gbtType << ", "
              << ppGBT[i].loadVar << ", " << ppGBT[i].gbtStartIdx << endl;
     }
-    
-    // tmp
-    if(tag == (uint64_t)(ppGBT[0].loadVar)) {
-        cerr << "TAG MATCHES, BOYYYYYYYYYYY!" << endl;
+}
+
+static PrimInfo* searchGBT(uint64_t tag)
+{
+    // Traverse the GBT and determine if the tag is there.
+    for(unsigned i = 0; i < ppGBTSize; ++i) {
+        uint64_t tagInTable = (uint64_t) ppGBT[i].loadVar;
+        if(tagInTable == tag)
+            return &ppGBT[i];
     }
-    // tmp
+    return 0;
+}
+
+void fakeInstFunc(double* param)
+{
+    cerr << "I AM AN INSTRUMENTATION FUNCTION, FEAR ME!" << endl;
+    *param = 3.14;
+}
+
+void Phase4::transform()
+{
+    cerr << "tag is " << m_tag << endl;
+    dumpGBT(cerr);
+
+    if(PrimInfo* pi = searchGBT(m_tag)) {
+        cerr << "Tag matches." << endl;
+
+        const InstCandidate& cand = m_pPhase4Info->getCandidate();
+#if 0
+        // Make a new slot that calls the instrumentation function, inserting a branch to
+        // it over the original code.
+
+        uint64_t slotBase = replaceInstWithBrToSlot(cand.front().first, getSlotSize(),
+                                                    m_pTraceCache, m_instManip);
+#endif
+
+        // Write NOPs over the original instructions that were associated with the elected
+        // candidate, but leave the branch instruction intact.
+
+        VirtualMem* vm = m_pTraceCache->getVM();
+        for(vector<std::pair<uint64_t, unsigned> >::const_iterator i = cand.getInsts().begin() + 1,
+                e = cand.getInsts().end(); i != e; ++i)
+            vm->writeInstToVM(i->first, m_instManip.getNOP());
+
+        // Write the instructions to call the instrumentation function
+
+        void* instFuncVP = (void*) fakeInstFunc; // From the GBT eventually
+        void (*instFunc)(void*) = (void (*)(void*)) instFuncVP;
+        
+        void* mem = malloc(sizeof(double));
+        instFunc(mem);
+        printf("%f\n", *((double*) mem));
+        free(mem);
+    }
+    else {
+        cerr << "Could not find tag" << endl;
+        // The candidate failed to get elected, so pack up and go home.  Restore the
+        // replaced instruction (i.e. the branch that invoked this code) with the original
+        // instruction at that location.
+        
+        VirtualMem* vm = m_pPhase4Info->getTraceCache()->getVM();
+        vm->writeInstToVM(m_pPhase4Info->getCandidate().front().first,
+                          m_pPhase4Info->getCandidate().front().second);
+    }
+
+    // (TEMP) For now, restore the candidate load to its original position for debugging
+    // purposes.
+
+    m_pPhase4Info->getTraceCache()->getVM()->writeInstToVM(m_pPhase4Info->getCandidate().front().first,
+                                                           m_pPhase4Info->getCandidate().front().second);
+
+    cerr << "================ End Phase 4 ================\n";
 }
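One way the "store the return value" issue noted in the phase 4 comments could be handled
without knowing the return type is the pointer-parameter convention discussed in
design.txt. A sketch with hypothetical names, assuming the GBT eventually records the
metric variable's address and size from phase 1:

    #include <stdlib.h>
    #include <string.h>

    typedef void (*InstFunc)(void*);

    // Hypothetical phase 4 helper: call the instrumentation function with a
    // temporary sized from phase 1 metadata, then copy the result into the
    // metric variable recorded in the GBT.
    static void invokeInstFunc(InstFunc f, void* metricVar, unsigned metricSize)
    {
        void* tmp = malloc(metricSize);
        f(tmp);                              // inst function writes through the pointer
        memcpy(metricVar, tmp, metricSize);  // commit result to the metric variable
        free(tmp);
    }

fakeInstFunc above already has this shape (modulo the double* cast); the per-call heap
temporary remains a problem for interval metrics, as the design notes discuss.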


Index: llvm/lib/Reoptimizer/Inst/design.txt
diff -u llvm/lib/Reoptimizer/Inst/design.txt:1.9 llvm/lib/Reoptimizer/Inst/design.txt:1.10
--- llvm/lib/Reoptimizer/Inst/design.txt:1.9	Fri Apr 18 12:29:00 2003
+++ llvm/lib/Reoptimizer/Inst/design.txt	Tue Apr 29 13:36:53 2003
@@ -886,12 +886,15 @@
 
 {{{ TODO
 
-  - Investigate trace-cache dummy function mechanisms, decide on approach A or B
-    in phase outline
+    - Get phase 2 allocation of spill space working, write spill code (to spill space) for
+      phase 3 invocation. (Currently NO spilling is being done, which is not safe)
 
-  - Implement phase outline
+    - Ensure phase 3 writes proper spill code for phase 4 invocation. (One spill space
+      should be sufficient)
 
-  - Read EEL paper to get a better feel for binary modification issues
+    - Start table-of-stacks implementation for phase4 authorship of phase 5 slots.
+
+    - Write phase 5 slot generation code, phase 5 function itself, etc.
 
 }}}
 
@@ -1006,13 +1009,10 @@
   Approach A:
 
   3e. Write phase 4 code in slot:
-      if(actually an instrumentation site)
-          rewrite branch at C to next instruction
-	  call proper instrumentation fnction <- C branches to here
-          branch back to C
-      else
-          restore original instructions
-          branch back to C
+
+      Load the address referenced by the candidate load instruction.
+      Call the phase 4 function.
+      Branch back to C.
 
   Approach B:
 
@@ -1025,7 +1025,134 @@
  	restore original instructions
  	branch back to C
 
-  In phase 4: No special action needed.
+  In phase 4: 
+
+  Actions of the phase 4 function:
+
+  1. Check the tag to verify GBT membership. If it is not found in the GBT, do nothing
+     besides return to the original code, etc.
+
+  2. Assuming the tag is valid, we must decide between two approaches at this
+     juncture:
+
+   a) Try to invoke the instrumentation function directly from within phase 4. 
+   b) Write code in yet another slot that will invoke the inst function.
+
+   The primary problem to solve in both of these approaches is how to allocate space for
+   values that are stored to / read from by the instrumentation function.  That is, for
+   point metrics, we must construct the semantic equivalent of a function call like:
+
+                         foo = someInstFunc();
+
+   where storage for foo has already been allocated by phase 1 (hence we can store its
+   address in the GBT). However, we know nothing about the *type* of the return value,
+   only its size.  We must determine the conventions of the call mechanisms for passing
+   back large (i.e. bigger than a register size) objects by value.  We can call
+   someInstFunc easily enough, but we must know how to write the code (using either
+   approach a or b above) to take the return value of the function and store it to the
+   metric variable. Phase 1 can store the address and the size of this variable, so it
+   should be simple enough to take the returned-by-value return value of the
+   instrumentation function and perform a memcpy to the appropriate location.  This works
+   fine for point metrics, but the problem is worsened significantly by region metrics,
+   because we must have a temporary value around to store the return value of the start
+   function and pass it by address to the end function...this probably has to be
+   accomplished via heap storage -- we had previously thought that alloca would be
+   sufficient, but I don't think this works (the runtime stack is manipulated in between
+   the time that the alloca'd variable would be stored to and the time that it would be
+   read, in the case of interval metrics).
+
+   Looks like we're going to have to do everything from the standpoint of parameter sizes,
+   memcpy's, and heap-allocated temporaries. The only way I can think of to do this in a
+   straightforward manner is to use the phase 4 function itself to do the call to the inst
+   function and the subsequent memcpy. But then we must compile the call to the inst
+   function, and we don't know what type it returns, because this information is not
+   preserved.  If it returns, say, a scalar double, how do we store this value in a
+   temporary and copy it to the metric variable (in the case of point metrics, for
+   example)? The only thing I can think of at present is to change the signature
+   conventions...instead of an instrumentation function returning stuff by value, it is
+   instead passed a pointer parameter.  Since we will know the sizes of the types from
+   phase 1, we can always heap-allocate the appropriately-sized parameter and pass this
+   raw pointer in to be used by the function as appropriate...but what kinds of problems
+   can be caused here? This is a good topic for conversation with Vikram...talked to
+   Vikram, and the problem is worse than I had originally thought. Heap allocation isn't
+   really an option because we would have to have one heap alloc/dealloc per interval
+   invocation, which is just too expensive.
+
+   A more accurate assessment of the problem.
+
+   We must have a stack-oriented way of saving temporary values between the start interval
+   function and the end interval function.  We had thought that we could do this via
+   alloca (i.e. manipulation of the stack pointer to obtain new space).  However, the only
+   way this can occur is if we use the current stack frame. Let's say that we want to
+   allocate n slots (i.e. n * 8 bytes).  Then, we would do:
+
+   %sp = %sp + (n * 8)
+   %reg = %sp + B + X
+
+   Where reg is just some register (we must spill/restore it before we clobber it here),
+   and X is the offset from %sp + B to the location on the stack where the newly-alloca'd
+   region is to start.  This must be "lower" than any previous allocas but "higher" than
+   the end of the contents starting at %sp + B (B is the bias).  According to the SparcV9
+   ABI, the size of X is equal to 128 bytes (for register spills) + 48 bytes (6 outgoing
+   registers, each with extended word size) + Q, where Q is the space required for "extra
+   outgoing arguments", that is, arguments to functions beyond the 6th.  Q is derived from
+   the greatest parameter count of any call within the function body associated with the
+   stack frame (that count minus 6, or 0 if no call passes more than 6 parameters).  For
+   example, if S is the stack frame associated with a particular invocation of the
+   function foo, and foo called some function, bar, that took 10 parameters, and no other
+   function was called by foo that had greater than 10 parameters, Q would exactly equal
+   4.
+
+   The problem is that, although the value of Q is known at compile-time, determining it
+   during phase 1 is premature (the vendor compiler may arbitrarily add arguments to
+   functions, for example), and determining it at runtime (on the assembly code itself) is
+   quite possibly not feasible (indirect functions, no way to really determine what are
+   parameters and what are not, etc). We currently do not have a way to obtain this value,
+   and so an attempt to solve this problem using the alloca approach must be abandoned.
+
+   One easy solution that presents itself is to do heap-based stuff, but this is very
+   inefficient and also quite expensive.
+
+   Idea: Manage a stack on the side.  The objection to this is that it involves extra
+   function calls.
+
+   In the meantime, in the interests of making forward progress, can we do anything with
+   heap allocation? Remember that we need a stack region into which we can spill the
+   global & FP registers, as well as the data between start- and end-interval functions.
+
+   A note about saving/restoring the global and FP registers: we know that the slot
+   created by phase 4 executes only once. Hence, it is valid to have phase 3 heap-allocate
+   a region large enough to spill the registers (the spill code would be placed in the
+   slot that calls the phase 4 function) and restore the registers.  The phase 4 function
+   would have to deallocate this heap region, which means that the epilogue in the phase 4
+   slot would be restoring the registers from a deleted chunk.  Or, a call to free the
+   chunk could be placed in the slot itself.  This is really the same problem as slot
+   deallocation in general, and shouldn't be a problem in single-threaded codes.  However,
+   we must determine a mechanism by which heap allocation can occur for the register
+   spills, and the use of the allocated regions must correspond on a per-invocation basis
+   (i.e. stack semantics) appropriately.  One idea is to use a "one-off" approach -- for
+   example, the phase 4 function would heap-allocate a spill region (R) to be used by the
+   first "real" invocation of the instrumentation (phase 5?).  Each phase 5 invocation
+   would spill to and restore from region R, and would allocate a heap region (R') to be
+   used by the next invocation of the instrumentation. Of course, there'd have to be a new
+   slot created that would spill to this new region, etc.  The regions (heap and slot)
+   could only be recycled as the call stack was popped. This is so gross I don't think
+   that it is an option.  So, spilling the global and FP registers is even more of a
+   problem than the data transfer between the start- and end-function invocations, and I
+   think we have to go back to a global stack approach.
+
+   Phase 4 initially creates a (large) heap region which will act as the global stack.  It
+   writes the phase-5 slot to use this address to spill to, and the stack base is held
+   onto somehow.  The phase 5 slot spills to the current stack pointer, and invokes the
+   phase 5 function.  The phase 5 function will allocate space at stack pointer + regsave
+   size for whatever data needs to pass between the start- and end- functions.  A pointer
+   to the start of the storage region for the start- function is passed into the function,
+   etc, and the OFFSET FROM THE STACK POINTER is stored in the field in the INTERVAL_START
+   record.  The phase 5 slot (after the call to the phase 5 function) restores from the
+   current stack pointer, but *does not change the stack pointer*.  The phase 5 invocation
+   would also reallocate the stack space if it detected that more space was needed
+   (important but not vital for the prototype implementation -- it can be "big enough" in
+   the initial implementation).  See handwritten notes for more detail.
 
 {{{ Notes on using the total-copy approach in the prototype implementation.
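A sketch of the side-managed global stack described in the design.txt changes above
(names and sizes are placeholders, not part of the commit):

    #include <stdint.h>
    #include <stdlib.h>

    static const unsigned STACK_BYTES = 1 << 20;  // "big enough" for the prototype
    static uint8_t* g_base = 0;
    static uint8_t* g_top  = 0;

    // Phase 4 would allocate the region that phase 5 slots spill into.
    void initGlobalStack()
    {
        g_base = (uint8_t*) malloc(STACK_BYTES);
        g_top  = g_base;
    }

    // Push one frame: register-save area plus interval-metric temporaries.
    // Temporaries live at frame + regSaveBytes; the INTERVAL_START record
    // stores that offset from the current stack pointer.
    uint8_t* pushFrame(unsigned regSaveBytes, unsigned tempBytes)
    {
        uint8_t* frame = g_top;
        g_top += regSaveBytes + tempBytes;
        return frame;
    }

    void popFrame(unsigned regSaveBytes, unsigned tempBytes)
    {
        g_top -= regSaveBytes + tempBytes;
    }

The extra function calls this implies are the stated objection; growing the region on
overflow is left out here, as the notes suggest it is not vital for the prototype.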
 




