[llvm-commits] CVS: llvm/lib/Reoptimizer/Inst/lib/PhaseInfo.h Phases.cpp SparcInstManip.cpp design.txt

Joel Stanley jstanley at cs.uiuc.edu
Tue Jun 24 10:41:08 PDT 2003


Changes in directory llvm/lib/Reoptimizer/Inst/lib:

PhaseInfo.h updated: 1.9 -> 1.10
Phases.cpp updated: 1.35 -> 1.36
SparcInstManip.cpp updated: 1.16 -> 1.17
design.txt updated: 1.15 -> 1.16

---
Log message:

Merge from branch jrsdev.



---
Diffs of the changes:

Index: llvm/lib/Reoptimizer/Inst/lib/PhaseInfo.h
diff -u llvm/lib/Reoptimizer/Inst/lib/PhaseInfo.h:1.9 llvm/lib/Reoptimizer/Inst/lib/PhaseInfo.h:1.10
--- llvm/lib/Reoptimizer/Inst/lib/PhaseInfo.h:1.9	Sun May 18 15:33:46 2003
+++ llvm/lib/Reoptimizer/Inst/lib/PhaseInfo.h	Tue Jun 24 10:39:52 2003
@@ -27,12 +27,9 @@
 typedef std::pair<uint64_t, uint64_t> AddressRange;
 
 typedef struct GBTElem {
-    unsigned gbtType;
-    unsigned short* loadVar;
-    unsigned gbtStartIdx;
-    unsigned paramSize;
-    void*    retVal;
-    void*    instFunc;
+    unsigned      siteID;
+    unsigned      gbtType;
+    unsigned      short* loadVar;
 };
 
 class Phase3Info 


Index: llvm/lib/Reoptimizer/Inst/lib/Phases.cpp
diff -u llvm/lib/Reoptimizer/Inst/lib/Phases.cpp:1.35 llvm/lib/Reoptimizer/Inst/lib/Phases.cpp:1.36
--- llvm/lib/Reoptimizer/Inst/lib/Phases.cpp:1.35	Thu May 22 22:26:12 2003
+++ llvm/lib/Reoptimizer/Inst/lib/Phases.cpp	Tue Jun 24 10:39:52 2003
@@ -45,10 +45,13 @@
 //
 //      2. Change the branch to the phase 4 slot to branch to a (new) phase 5 slot. See
 //      appropriate InstManip instance for detailed information about phase 5 slot
-//      contents.
+//      contents.  If there is no registered instrumentation for the phase 5 slot, leave a
+//      nop in place of the branch to reduce runtime overhead.
 //
 //      3. Deallocate the slot that originated this invocation of phase4().
 //
+// PHASE 5: Phase 5 isn't like the other phases; rather, it simply invokes all
+//          registered instrumentation functions for a particular site.
 
 #include <algorithm>
 #include <iomanip>
@@ -56,6 +59,7 @@
 #include <set>
 #include <stdlib.h>
 #include <vector>
+#include <list>
 
 #include "llvm/Reoptimizer/Inst/ElfReader.h"
 #include "llvm/Reoptimizer/MemoryManager.h"
@@ -146,11 +150,173 @@
     uint64_t    m_tag;         // Entry to look for in the GBT
 };
 
+// InstFunctionInfo is the class used to represent information (e.g., address of return
+// value, pointer-to-instrumentation function) about particular registered instrumentation
+// functions.  In the case of end-interval functions, the link 'm_pStart' is filled in to
+// refer to the InstFunctionInfo instance that contains information about the
+// corresponding start-interval function.
+
+class InstFunctionInfo 
+{
+  public:
+    InstFunctionInfo(void* pRetVal,
+                     void* pInstFunc,
+                     InstFunctionInfo* pStart = 0):
+        m_pRetVal(pRetVal),
+        m_pInstFunc(pInstFunc),
+        m_pStart(pStart),
+        m_invoked(false)
+    {
+    }
+
+    InstFunctionInfo():
+        m_pRetVal(0),
+        m_pInstFunc(0),
+        m_pStart(0),
+        m_invoked(false)
+    {
+    }
+
+    void invoke();
+
+  protected:
+    void*             m_pRetVal;
+    void*             m_pInstFunc;
+    InstFunctionInfo* m_pStart;   // Info about start-interval function
+    bool              m_invoked;  // Has this function been invoked yet?
+};
+
+// InstSiteInfo instances contain information about the state of particular
+// instrumentation sites.  More specifically, each instance holds the
+// instrumentation status (e.g., whether or not the site has been handled by
+// phase 4 yet) of its site, as well as the list of (pointers to) the
+// InstFunctionInfo instances registered with that site.
+
+class InstSiteInfo 
+{
+  public:
+    InstSiteInfo():
+        m_branchInstalled(false),
+        m_instrumented(false)
+    {
+    }
+
+    // For start-interval sites only -- allocates memory for the return value of
+    // the instrumentation function (size of allocated memory is retValBytes).
+    // Returns a pointer to the InstFunctionInfo instance that corresponds to
+    // the instrumentation function, or null if the function has already been
+    // registered.
+
+    InstFunctionInfo* push_back(unsigned retValBytes,
+                                void* func);
+
+    // For end-interval sites (w/ optional link to corresponding start); other
+    // site types should use this routine as well.  The provided retVal ptr is
+    // used as the return-value parameter of the instrumentation function.
+
+    void push_back(void* retVal,
+                   void* func,
+                   InstFunctionInfo* startInfo = 0);
+
+    void invokeFunctions();
+
+  protected:
+    bool                      m_branchInstalled;     // Installed branch to slot yet?
+    bool                      m_instrumented;        // Has phase 4 instrumented site?
+    uint64_t                  m_brInstallAddr;       // Address to install branch inst
+    unsigned                  m_branchInst;          // The branch inst to install
+    std::set<void*>           m_registeredFuncs;     // Set of func-ptrs registered here
+    vector<InstFunctionInfo*> m_instFuncInfos;       // Info for all registered funcs
+
+    friend void Phase4::transform();
+
+    void installBranch();
+
+    bool isRegistered(void* func) 
+    {
+        return m_registeredFuncs.find(func) != m_registeredFuncs.end();
+    }
+
+    void setInstrumented() { m_instrumented = true; }
+
+    void setBranchInst(uint64_t addr, unsigned branchInst)
+    {
+        m_brInstallAddr = addr;
+        m_branchInst = branchInst;
+    }
+};
+
+// InstInfo is the class that holds data about the instrumentation that gets
+// bound to instrumentation sites and intervals at runtime.  There should only
+// be one instance of this class (i.e., it is a singleton class).  The
+// implementation hides an STL map that maps the unique identifier associated
+// with an instrumentation interval/site to a pair of InstSiteInfo instances,
+// which contain the information about the instrumentation registered for the
+// given interval or point site.  In the case of intervals, the first element of
+// the pair is the InstSiteInfo instance that contains data about the start
+// site, whereas the second element of the pair contains data about the end
+// site.  For point sites, only the first element of the pair contains valid
+// data.
+
+class InstInfo 
+{
+  public:
+    typedef std::pair<InstSiteInfo, InstSiteInfo> SiteInfoPair;
+
+    static InstInfo* instance() 
+    {
+        if(!m_pInstance)
+            m_pInstance = new InstInfo;
+        return m_pInstance;
+    }
+
+    static InstSiteInfo* findSiteInfo(unsigned siteID, unsigned gbtType) 
+    {
+        SiteInfoPair* sip = instance()->findSiteInfo(siteID);
+        InstSiteInfo* siteInfo;
+
+        switch(gbtType) {
+            case pp::GBT_INTERVAL_START: siteInfo = &sip->first; break;
+            case pp::GBT_INTERVAL_END:   siteInfo = &sip->second; break;
+            default: assert(0 && "Unhandled gbtType encountered"); break;
+        }
+
+        return siteInfo;
+    }
+
+    SiteInfoPair* findSiteInfo(unsigned siteID)
+    {
+        SiteInfoMap::iterator i = m_siteInfoMap.find(siteID);
+        if(i == m_siteInfoMap.end())
+            return &m_siteInfoMap[siteID];
+        return &i->second;
+    }
+
+    void setVM(VirtualMem* vm) { m_pVM = vm;   }
+    VirtualMem* getVM() const  { return m_pVM; }
+
+  protected:
+    typedef std::map<unsigned, SiteInfoPair> SiteInfoMap;
+
+    InstInfo(): m_pVM(0)
+    {
+    }
+    
+    SiteInfoMap m_siteInfoMap;
+    VirtualMem* m_pVM;
+
+  private:
+    static InstInfo* m_pInstance;
+};
+
+InstInfo* InstInfo::m_pInstance = 0;
+
 //////////////// Phase 2 implementation ////////////////
 
 extern "C" void phase2() 
 {
     TraceCache* pTC = new TraceCache();
+    InstInfo::instance()->setVM(pTC->getVM());
     Phase2 ph(pTC, new SparcInstManip(pTC));
     ph.transform();
 }
@@ -192,7 +358,7 @@
         if(m_excludeSet.find(i->first) == m_excludeSet.end()) {
             // Function is not in exclude set, so go ahead and transform it
 
-            DEBUG_MSG(1, "Transforming function " << i->first
+            DEBUG_MSG(4, "Transforming function " << i->first
                       << "[" << HEX(i->second.first)
                       << ", " << HEX(i->second.second) << "]...\n");
             
@@ -217,17 +383,30 @@
     ::doFlush(slotBase, slotBase + im->getInstWidth() * snippet.size());
 }
 
+static uint64_t makeNewSlot(uint64_t srcAddr,
+                            unsigned slotSize,
+                            unsigned& branchInst,
+                            TraceCache* tc,
+                            InstManip* im)
+{
+    // Return a branch instruction to the new slot via branchInst.
+    uint64_t slotBase = tc->getMemMgr()->getMemory(slotSize);
+    assert(slotBase && "Unable to obtain memory from MemoryManager instance");
+
+    branchInst = im->getBranchAlways(slotBase, srcAddr);
+    return slotBase;
+}
+
 static uint64_t replaceInstWithBrToSlot(uint64_t srcAddr,
                                         unsigned slotSize,
                                         TraceCache* tc,
                                         InstManip* im) 
 {
-    // Obtain a new slot of the given size
-    uint64_t slotBase = tc->getMemMgr()->getMemory(slotSize);
-    assert(slotBase && "Unable to obtain memory from MemoryManager instance");
+    unsigned branchInst;
+    uint64_t slotBase = makeNewSlot(srcAddr, slotSize, branchInst, tc, im);
 
     // Replace instruction at srcAddr with branch to start of new slot
-    tc->getVM()->writeInstToVM(srcAddr, im->getBranchAlways(slotBase, srcAddr));
+    tc->getVM()->writeInstToVM(srcAddr, branchInst);
     ::doFlush(srcAddr, srcAddr + im->getInstWidth());
 
     return slotBase;
@@ -363,8 +542,8 @@
         vector<unsigned> snippet;
         m_pIM->buildSlot(p4info, snippet);
 
-        DEBUG_MSG(3, "phase4 slot instructions:\n");
-#if VERBOSE > 2
+        DEBUG_MSG(4, "phase4 slot instructions:\n");
+#if VERBOSE > 3
         dumpSnippet(snippet, m_pIM);
 #endif
 
@@ -430,7 +609,7 @@
 
     for(unsigned i = 0; i < ppGBTSize; ++i) {
         ostr << "[pp] ppGBT[" << i << "]: " << ppGBT[i].gbtType << ", "
-             << ppGBT[i].loadVar << ", " << ppGBT[i].gbtStartIdx << endl;
+             << ppGBT[i].loadVar << endl;
     }
 }
 
@@ -460,29 +639,44 @@
                && "Unexpected number of instructions in candidate");
 
         // Write NOPs over the original instructions that were associated with the elected
-        // candidate.  No need to no-op over the candidate load instruction itself since
-        // we're about to write over it with a branch to the phase 5 slot.
+        // candidate.
 
         VirtualMem* vm = m_pTC->getVM();
-        for(vector<std::pair<uint64_t, unsigned> >::const_iterator i = cand.getInsts().begin() + 1,
+        for(vector<std::pair<uint64_t, unsigned> >::const_iterator i = cand.getInsts().begin(),
                 e = cand.getInsts().end(); i != e; ++i)
             vm->writeInstToVM(i->first, m_pIM->getNOP());
 
-        // Obtain memory (& rewrite branch) to the phase 5 jump slot.
+        // Obtain new slot, the phase 5 jump slot.
         
         unsigned slotSize = m_pIM->getSlotSize(this);
         uint64_t repAddr = cand.front().first;
-        uint64_t slotBase = replaceInstWithBrToSlot(repAddr, slotSize, m_pTC, m_pIM);
+        unsigned branchInst;
+        uint64_t slotBase = makeNewSlot(repAddr, slotSize, branchInst, m_pTC, m_pIM);
 
         vector<unsigned> snippet;
         m_pIM->buildSlot(gbte, slotBase, repAddr, m_pPhase4Info->getRange(), snippet);
 
-        DEBUG_MSG(3, "phase 5 slot contents:\n");
-#if VERBOSE > 2
+        DEBUG_MSG(4, "phase 5 slot contents:\n");
+#if VERBOSE > 3
         dumpSnippet(snippet, m_pIM);
 #endif
 
         copySnippetToSlot(snippet, slotBase, m_pTC->getVM(), m_pIM);
+
+        // Grab the information about this particular site.
+
+        assert(gbte->gbtType == pp::GBT_INTERVAL_START ||
+               gbte->gbtType == pp::GBT_INTERVAL_END &&
+               "Unhandled gbtType encountered (must implement)");
+        
+        InstSiteInfo* siteInfo = InstInfo::findSiteInfo(gbte->siteID, gbte->gbtType);
+
+        // Take steps to install the branch; note that the InstSiteInfo instance
+        // knows whether or not to actually write the branch instruction, etc.
+
+        siteInfo->setInstrumented();
+        siteInfo->setBranchInst(repAddr, branchInst);
+        siteInfo->installBranch();
     }
     else {
         DEBUG_MSG(1, "does not match\n");
@@ -495,14 +689,6 @@
                           m_pPhase4Info->getCandidate().front().second);
     }
 
-#if 0
-    // (TEMP) For now, restore the candidate load to its original position for debugging
-    // purposes.
-
-    m_pPhase4Info->getTraceCache()->getVM()->writeInstToVM(m_pPhase4Info->getCandidate().front().first,
-                                                           m_pPhase4Info->getCandidate().front().second);
-#endif
-
     DEBUG_MSG(1, "================ End Phase 4 ================\n");
 }
 
@@ -510,27 +696,131 @@
 
 void phase5(GBTElem* gbte)
 {
-    switch(gbte->gbtType){
-        case pp::GBT_INTERVAL_START: {
-            DEBUG_MSG(1, "--- phase 5 start site invocation ---\n");
-            DEBUG_MSG(2, "retVal address is " << HEX(gbte->retVal) << endl);
-
-            void (*instFunc)(void*) = (void (*)(void*)) gbte->instFunc;
-            instFunc(gbte->retVal);
-            break;
-        }
-        case pp::GBT_INTERVAL_END: {
-            DEBUG_MSG(1, "--- phase 5 end site invocation ---\n");
-            DEBUG_MSG(2, "start parameter is at gbt index " << gbte->gbtStartIdx << endl);
-            DEBUG_MSG(2, "start parameter addr is "
-                      << HEX(ppGBT[gbte->gbtStartIdx].retVal) << endl);
+    DEBUG_MSG(1, "================ Begin Phase 5 ================\n");    
+    InstSiteInfo* siteInfo = InstInfo::findSiteInfo(gbte->siteID, gbte->gbtType);
+    siteInfo->invokeFunctions();
+    DEBUG_MSG(1, "================ End Phase 5 ================\n");
+}
+
+//////////////// InstSiteInfo implementation ////////////////
+
+InstFunctionInfo* InstSiteInfo::push_back(unsigned retValBytes,
+                                          void* func)
+{
+    DEBUG_MSG(3, "Inside InstSiteInfo::push_back, registering func w/ address "
+              << HEX(func) << ", new retVal of size " << retValBytes << endl);
+
+    InstFunctionInfo* fi = 0;
+
+    if(!isRegistered(func)) {
+        DEBUG_MSG(3, "not yet registered, registering...\n");
+
+        void* retVal = static_cast<void*>(new char[retValBytes]);
+        m_registeredFuncs.insert(func);
+        m_instFuncInfos.push_back(fi = new InstFunctionInfo(retVal, func));
+        installBranch();
+    }
+    else
+        DEBUG_MSG(3, "WARNING: Attempt to register instrumentation at site that was already instrumented\n");
+
+    return fi;
+}
 
-            void (*instFunc)(void*, void*) = (void (*)(void*, void*)) gbte->instFunc;
-            instFunc(gbte->retVal, ppGBT[gbte->gbtStartIdx].retVal);
+void InstSiteInfo::push_back(void* retVal,
+                             void* func,
+                             InstFunctionInfo* startInfo)
+{
+    DEBUG_MSG(3, "Inside InstSiteInfo::push_back, registering func w/ address "
+              << HEX(func) << ", retVal addr " << HEX(retVal) << endl);
+
+    if(!isRegistered(func)) {
+        DEBUG_MSG(3, "not yet registered, registering...\n");
+
+        m_registeredFuncs.insert(func);
+        m_instFuncInfos.push_back(new InstFunctionInfo(retVal, func, startInfo));
+        installBranch();
+    }
+    else
+        DEBUG_MSG(3, "WARNING: Attempt to register instrumentation at site that was already instrumented\n");
+}
+
+void InstSiteInfo::invokeFunctions()
+{
+    assert(m_branchInstalled && m_instrumented && "Invoking functions not permitted");
+
+    for(int i = 0, e = m_instFuncInfos.size(); i < e; ++i) {
+        DEBUG_MSG(3, "Invoking function " << i << endl);
+        m_instFuncInfos[i]->invoke();
+    }
+}
+
+void InstSiteInfo::installBranch()
+{
+    if(!m_branchInstalled && m_instrumented && m_instFuncInfos.size() > 0) {
+        DEBUG_MSG(3, "(InstSiteInfo::installBranch) Installing branch...\n");
+        VirtualMem* vm = InstInfo::instance()->getVM();
+        vm->writeInstToVM(m_brInstallAddr, m_branchInst);
+        m_branchInstalled = true;
+    }
+    else
+        DEBUG_MSG(3, "(InstSiteInfo::installBranch) Not installing branch, it's not needed\n");
+}
+
+//////////////// InstFunctionInfo implementation ////////////////
 
-            break;
+void InstFunctionInfo::invoke()
+{
+    // In the case of start-interval functions, the boolean m_invoked is set to true upon
+    // completion of the invocation.  This same boolean (i.e., the one associated with the
+    // start-interval InstFunctionInfo instance) is cleared by the corresponding
+    // end-interval function invocation.  This is to ensure that no end-interval
+    // instrumentation function is ever invoked unless its corresponding start-interval
+    // instrumentation function has been invoked (which implies that the start-interval
+    // site has been handled properly and that all of the registration mechanisms have run).
+
+    DEBUG_MSG(3, "(InstFunctionInfo::invoke) retVal address is: " << HEX(m_pRetVal) << endl);
+    if(m_pStart) {
+        DEBUG_MSG(3, "End-interval inst func detected\n");
+        void (*instFunc)(void*, void*) = (void (*)(void*, void*)) m_pInstFunc;
+
+        if(m_pStart->m_invoked) {
+            DEBUG_MSG(4, "Corresponding start function has been invoked, so invoking this inst func\n");
+            instFunc(m_pRetVal, m_pStart->m_pRetVal);
+            m_pStart->m_invoked = false;
         }
+        else
+            DEBUG_MSG(4, "Corresponding start func has not been invoked, so not invoking\n");
+    }
+    else {
+        DEBUG_MSG(3, "Start-interval or non-end inst func detected\n");
+        void (*instFunc)(void*) = (void (*)(void*)) m_pInstFunc;
+        instFunc(m_pRetVal);
+        m_invoked = true;
     }
+        
+    DEBUG_MSG(3, "(InstFunctionInfo::invoke) instrumentation function returned\n");
+}
+
+//////////////// register{Interval,Point}Inst implementation ////////////////
 
-    DEBUG_MSG(1, "--- phase 5 invocation completed ---\n" << std::flush);
+extern "C" void registerIntervalInst(unsigned siteID,
+                                     void* startFunc,
+                                     void* endFunc,
+                                     unsigned paramSize,
+                                     void* retVal)
+{
+    InstInfo::SiteInfoPair* sip = InstInfo::instance()->findSiteInfo(siteID);
+    assert(sip && "Unable to obtain SiteInfoPair");
+    
+    // Handle start function
+    DEBUG_MSG(3, "registerIntervalInst: Registering start function...\n");
+    InstFunctionInfo* fi = sip->first.push_back(paramSize, startFunc);
+
+    if(fi) {
+        // Handle end function
+        DEBUG_MSG(3, "registerIntervalInst: Registering end function...\n");
+        sip->second.push_back(retVal, endFunc, fi);
+    }
+    else
+        DEBUG_MSG(3, "WARNING: Register-start-function returned 0, which implies redundant registration");
 }


Index: llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.cpp
diff -u llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.cpp:1.16 llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.cpp:1.17
--- llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.cpp:1.16	Wed May 28 08:52:41 2003
+++ llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.cpp	Tue Jun 24 10:39:53 2003
@@ -242,14 +242,6 @@
 
     DEBUG_MSG(2, "buildPhase5HeapSlot completed\n");
 
-    // If we're dealing with a start-interval instrumentation function, heap-allocate
-    // its parameter memory
-
-    if(gbte->gbtType == pp::GBT_INTERVAL_START) {
-        assert(!gbte->retVal && "Expected null retVal value");
-        gbte->retVal = static_cast<void*>(new char[gbte->paramSize]);
-    }
-
     ////////////////
     // Construct the phase 5 jump slot
 
@@ -363,7 +355,7 @@
                         unsigned sizeBytes,
                         int protBits)
 {
-    DEBUG_MSG(3, "Setting access bits on heap slot page(s)" << endl);
+    DEBUG_MSG(4, "Setting access bits on heap slot page(s)" << endl);
 
     int rc = mprotect(pageBase, sizeBytes, protBits);
     if(rc < 0) {
@@ -416,10 +408,10 @@
 
     setPageBits(heapSlot, numHeapBytes, PROT_READ | PROT_WRITE | PROT_EXEC);
 
-#if VERBOSE > 2
-    DEBUG_MSG(3, "Dumping contents of heap-slot memory...\n");
+#if VERBOSE > 3
+    DEBUG_MSG(4, "Dumping contents of heap-slot memory...\n");
     dumpHeapSlot(heapSlot, getPhase5HeapSize(), this);
-    DEBUG_MSG(3, "Done with heap region construction, moving on to jump slot\n");
+    DEBUG_MSG(4, "Done with heap region construction, moving on to jump slot\n");
 #endif
 
     return heapSlot;
@@ -462,7 +454,7 @@
         return addr + getInstWidth();
     }
     else {
-        DEBUG_MSG(2, "WARNING: Non-save instruction at function entry\n");
+        DEBUG_MSG(4, "WARNING: Non-save instruction at function entry\n");
         return addr;
     }
     
@@ -569,7 +561,7 @@
                                                m_logicalToActualReg[useForIndirect],
                                                offset));
 
-    DEBUG_MSG(3, "JMPL instruction word is " << HEX(m_pCurrSnippet->back()) << endl);
+    DEBUG_MSG(4, "JMPL instruction word is " << HEX(m_pCurrSnippet->back()) << endl);
     m_pCurrSnippet->push_back(getNOP());
 
     assert(m_pCurrSnippet->size() - initSize == GEN_JMPL_SIZE &&


Index: llvm/lib/Reoptimizer/Inst/lib/design.txt
diff -u llvm/lib/Reoptimizer/Inst/lib/design.txt:1.15 llvm/lib/Reoptimizer/Inst/lib/design.txt:1.16
--- llvm/lib/Reoptimizer/Inst/lib/design.txt:1.15	Sun May 18 12:45:26 2003
+++ llvm/lib/Reoptimizer/Inst/lib/design.txt	Tue Jun 24 10:39:53 2003
@@ -880,91 +880,70 @@
 
 {{{ MILESTONES
 
+  - Experiments
+  - The Paper
+  - The Thesis
 
 }}}
 
 {{{ TODO
 
-    - Move statically-sized spill regions so that they are internal to SparcInstManip.
-      (do not need variable-sized spill region except for phase5 invocations)
+    In priority order:
 
-    - Start table-of-stacks implementation for phase4 authorship of phase 5 slots.
-      - Placed on hold temporary because of "alloca-finding" approach. However, see the 
-        following e-mail for the current state of things:
-
-        {{{ E-mail regarding alloca-finding and table-of-stacks approach
-Okay, this is starting to seem intractable. I have another problem that
-I don't think can be resolved without resorting to a custom-stack
-mechanism that will incur prohibitive overhead.
-
-Everything is working for start-region instrumentation sites.  For
-end-region instrumentation sites, however, there's a problem. In order
-to write the slot for end sites, I have to know (or know how to compute)
-the address of the return value of the corresponding start site. I had
-originally thought that I would just store this in the GBT, or
-"something", but I clearly didn't think through the problem well enough.
-
-There are only two ways I can think of that this can occur:
-
-(a) Write the effective address of the return value of the start inst
-func, so that it gets passed to the end inst func.
-
-or
-
-(b) Somehow encode the stack offset to the return value from the start
-inst, where the offset is from the %sp *at the end-region site*
-
-Both of these have problems.
-
-First, I don't think (b) can work at all, given that there may be
-alloca's present in the original application that would change the %sp,
-and thus the offset value that we'd need, and we can't determine the
-exact allocas that are executed statically.
-
-For (a), the effective address isn't known until runtime. We can store
-this address in some global table where the phase 4 invocation for the
-end site can find it, but it is not sufficient to have a single scalar
-address here -- we must have a stack, due to potential recursive
-invocations. I think that this is clear, please let me know if I'm not
-making sense. :) 
-
-Hence, we'd need to maintain a stack of effective addresses, which was
-pushed during the execution of phase 5 for the start site, and then read
-and popped during the execution of phase 5 for the end site.  We're
-already really bloated with how many instructions we've got going on for
-all of the spills, etc, and I'm concerned about the effect that this
-stack manipulation will have on our overhead, as we talked about before.
-
-The way I see it, we only have two options if we're to make forward
-progress and not obliterate our chances of having lower overhead
-numbers. Hopefully we have some better choices. In the interests of
-short-term forward progress, I'm going to go with #1 for now.
-
-#1 - Make another common-case assumption that there will be no allocas
-between start and end sites, on *any* control path. If this is the case,
-then we know that the stack pointer will not have been manipulated (I
-think) between the start and end sites, and so the %sp offsets to the
-requisite data will be unchanged since when the phase 5 step occurred
-for the start site.  
-
-#2 - Just implement our fall-back solution that everything seems to be
-pointing to. I'm not sure exactly what other logistic nightmares might
-be entailed in this, though, because I've only a sketch of the idea.
-
-I wanted to point out, also, that the so-called "fall back" approach we
-discussed previous also involves manipulation of a stack at runtime
-(push/pop actions still have to occur at runtime), so perhaps the stack
-of effective addresses is less prohibitive than I thought, if only in
-the sense that we cannot avoid it. :(
-        }}}
-
-    - Write phase 5 stuff for end-region sites -- will assume that not allocas lie between
-    the start and end sites, which is not particularly a fair assumption.
-
-    - Optimizations:
-        - No need to save registers (other than those clobbered) in phase 3 slot, since phase 3
-          is invoked at the start of the function. Must still spill/restore shared, though.
-        - No need to save registers (other than those clobbered) in general.
+    1) New implementation for instrumenting at function-level granularity
+    2) Apache through LLVM, experiments
+    3) Writing, writing, writing: ICS version 2 paper, do
+       a) outline
+       b) intro
+       c) language section
+       d) compiler section
+
+    The three top-level items above are more-or-less interchangeable. However, the
+    experiments cannot be completed unless the "instrumentation @
+    function-level granularity" implementation is done, and there should be an emphasis on
+    getting Apache through LLVM in the short-term because Chris is leaving for Norway in
+    early July.
+
+    However, *all* of the writing (including experimentation sections) for ICS v2 must be
+    done by the end of June, so that Adve can approve it, make the desired corrections,
+    etc., so that I can get started on thesis authorship and submission.  Realistically,
+    the timeframe should look something like this:
+
+    Week of 6/10 (5 days): Implementation, Apache w/ bug reports
+    Week of 6/16 (6 days): Implementation, Small example, continue Apache & do tests
+
+    -- At this point, all experiments should be more-or-less completed --
+
+    Week of 6/23 (6 days): Write, write, write. 3 days nonstop for content, 3
+                           days of Adve making corrections, etc.
+
+    Monday, 6/30 is D-Day...
+
+    Schedule revision 13 Jun 2003: Apache is in stasis waiting to hear back from
+    Chris. Some effort might be expended to see if any more work can be done on compiling
+    Apache even though the build process currently fails altogether.  Implementation for
+    instrumentation at function-level granularity is in stasis until I hear back from
+    Vikram.  This leaves three immediate options open until either of the two is
+    resolved, at which point forward progress on either the implementation or Apache should
+    be made.
+
+       Option 1) POV-Ray through LLVM.
+       Option 2) Writing
+       Option 3) "Small example"
+
+    Option 3 really shouldn't be undertaken until we know if obtaining things like I/O
+    elapsed time (via function-level) is possible, or until I can talk with Vikram in our
+    next meeting about what the heck this nebulous example should look like...this leaves
+    only options 1 and 2 above as viable.
+
+    POV-Ray should be an easy thing to start, and would be good both as a fall-back if
+    Apache isn't possible as well as a useful additional example if the latter is
+    possible.  This would also give V & C some time to respond to the pending queries.
+
+    - Optimizations:
+        - No need to save registers (other than those clobbered) in phase 3 slot,
+          since phase 3 is invoked at the start of the function. Must still
+          spill/restore shared, though.
+        - No need to save registers (other than those clobbered) in general.
 
 }}}
 
@@ -1460,5 +1439,226 @@
 Also, Chris remarked that any novel page-management mechanisms, etc., (for the
 code that is jumped to in newly-allocated pages) that I devise should perhaps be
 integrated into the LLVM JIT if they are suitable.
+
+}}}
+
+{{{ Experiments
+
+We must devise experiments that demonstrate the novel aspects of the work.  We
+are currently planning on using Apache and/or POV-Ray, and demonstrating how a
+"deep" performance analysis can be encoded using performance primitives. A
+"deep" performance analysis is one which essentially (using Hollingsworth's
+terminology from his PhD thesis) gets at the "what, where, and when" aspects of
+performance bottlenecks.  However, instead of doing this at the "arbitrary
+program during execution" level, as the W^3 search model does, we will encode
+these performance aspects at the application level itself.
+
+Here's what Vikram suggested for a good start:
+
+  I just thought that the examples of performance issues he explores in his
+  automatic search would give you (a) some insights into what performance issues
+  make sense to consider, and (b) some ideas about how to do a systematic
+  diagnosis, albeit at the application level.
+  
+  But isn't detecting the "what, why, and when" of performance bottlenecks
+  pretty closely related to the goal of performance diagnosis?  We'd be looking
+  for bottlenecks too, except that we can use application domain information, we
+  can look for bottlenecks at the algorithmic level instead of the general
+  system level, and we can record it permanently in the program as a first class
+  feature of the program.
+  
+  Anyway, about your second question: here's a way to do what I suggesting:
+  
+  -- think about 2-3 key performance issues with Apache (or POV-Ray) that you'd
+  want to diagnose e.g., cache misses, TLB misses, thread overhead (estimating
+  that could be interesting), I/O delays
+  
+  -- if those issues make sense with a small sort example, try to diagnose those
+  issues in the small example first.  e.g., I think cache misses, TLB misses,
+  and I/O delays would all be issues if you were sorting a huge file of some
+  kind.
+  
+  This is purely to give you a small, well-understood code to try out before
+  going to the big ones where it may be difficult to know, when one diagnosis
+  attempt fails, whether it failed because you misunderstood the performance
+  issues or because the guess was wrong or both.
+
+TLB misses aren't an option, because we cannot get at them with PAPI. Cache
+misses are available, so that'd certainly be a good place to start.  As for I/O
+delays, I have no idea how we'd measure this, either.  The following is the
+comprehensive list of the low-level metrics that are exposed to us via PAPI.  On
+our own, we can support simple things like elapsed time, load average, etc.  I'm
+not altogether clear how we'd determine how much time (for a particular region)
+was spent doing I/O-bound activities...
+
+Number of hardware counters: 2
+Name: PAPI_L1_ICM 	Description: Level 1 instruction cache misses
+Name: PAPI_L2_TCM 	Description: Level 2 cache misses
+Name: PAPI_CA_SNP 	Description: Requests for a snoop
+Name: PAPI_CA_INV 	Description: Requests for cache line invalidation
+Name: PAPI_L1_LDM 	Description: Level 1 load misses
+Name: PAPI_L1_STM 	Description: Level 1 store misses
+Name: PAPI_BR_MSP 	Description: Conditional branch instructions mispredicted
+Name: PAPI_TOT_IIS 	Description: Instructions issued
+Name: PAPI_TOT_INS 	Description: Instructions completed
+Name: PAPI_LD_INS 	Description: Load instructions
+Name: PAPI_SR_INS 	Description: Store instructions
+Name: PAPI_TOT_CYC 	Description: Total cycles
+Name: PAPI_IPS    	Description: Instructions per second
+Name: PAPI_L1_DCR 	Description: Level 1 data cache reads
+Name: PAPI_L1_DCW 	Description: Level 1 data cache writes
+Name: PAPI_L1_ICH 	Description: Level 1 instruction cache hits
+Name: PAPI_L2_ICH 	Description: Level 2 instruction cache hits
+Name: PAPI_L1_ICA 	Description: Level 1 instruction cache accesses
+Name: PAPI_L2_TCH 	Description: Level 2 total cache hits
+Name: PAPI_L2_TCA 	Description: Level 2 total cache accesses
+
+So, in the short-term, we have three outstanding problems. First, what kinds of
+metrics would we want to apply to Apache/POV-Ray/Simple example? Second, of those
+metrics, which can we actually realize with the current system? Third, we should
+create the simple program (such as sorting large amounts of data from a file,
+etc.) so that it can use these metrics in such a way that "models" or
+"anticipates" the way they will be used in the bigger applications.
+
+This is an outstanding issue, and I don't really know where to go with it yet.
+
+More notes about this as of 4 Jun 2003:
+
+One way to obtain metric values that are based on the elapsed times of
+particular functions is to somehow register instrumentation for those particular
+functions, and for a particular region -- Vikram argues that we have the ability
+to do this dynamically and we don't need any markers or phase-1 actions because
+we're operating at function-level granularity. 
+
+Here is a sample scenario: We have defined an interval I over some scoped region
+of code.  During phase 1 and phase 2, no instrumentation is registered for this
+interval.  Later on, we construct a metric that is qualified by a list of
+functions that (for example) are to have their runtimes measured and added to
+some running total.  Let's call this the "measure_functions_start" and
+"measure_functions_end" metric, and have it yield a value of type double which
+is the aggregate runtime of the list of functions when they get executed within
+interval I.  The metric registration function will have to have some way
+(varargs?) of denoting what the functions are: perhaps it can simply pass in an
+array of function names together with a size.
+
+Example: 
+
+pp_registerIntervalInst(intervalID, measure_functions_start,
+                        measure_functions_end, &retVal, sizeof(double), 
+                        func1, func2, func3, ...);
+
+However, what do "measure_functions_start" have to do with anything? More than
+likely, what we need to do is specify a particular metric to apply to a
+paritcular function, such that the value will be sampled each time that function
+gets executed.  Then, since there can be multiple invocations (and hence,
+multiple samples) for this selected function within I, we will have to have some
+default or user-specified way of aggregating the data :(.  This is gross.  In
+other words, we should probably simplify the above to something like:
+
+pp_registerIntervalInst(intervalID, some_metric_start, some_metric_end, 
+                        &retVal, sizeof(double), HOW_TO_AGGREGATE, func1);
+
+Where func1 is the function to be instrumented and HOW_TO_AGGREGATE is some
+value that specifies one of a couple of ways of combining the data.  For now,
+HOW_TO_AGGREGATE will not exist and will implicitly sum all return
+values...hence, if the some_metric_{start,end} function ptrs above were to measure
+elapsed time, at the end of the interval I with interval id intervalID, retVal
+would contain the combined elapsed time of all time spent in function func1.
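
As a concrete illustration of the implicit summing, here is a minimal sketch of
what such an elapsed-time metric pair might look like; the function names and
the clock source are hypothetical, and this says nothing about how the sampling
hooks actually get installed on func1.

#include <ctime>

// some_metric_start records a timestamp into a per-invocation scratch buffer;
// some_metric_end adds the elapsed time of this invocation of func1 into the
// interval's running total (aggregation by summing).
static void some_metric_start(void* scratch) {
    *static_cast<std::clock_t*>(scratch) = std::clock();
}

static void some_metric_end(void* retVal, void* scratch) {
    std::clock_t start = *static_cast<std::clock_t*>(scratch);
    *static_cast<double*>(retVal) +=
        double(std::clock() - start) / CLOCKS_PER_SEC;
}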
+
+This measurement is enabled at the start of the interval and disabled at the end
+of the interval; clearly, the start-instrumentation site will need to transform
+func1 (to compute the metric value) when crossed, and the end-instrumentation
+site must remove such instrumentation when crossed. This doesn't follow the
+normal model of what occurs at the instrumentation sites, in terms of just
+ripping down a list of functions...or does it? Perhaps one of the functions in
+the list is just this function that does the transformation on the target
+function...in this case, the process would look something like:
+
+1. Register the transformation function (for the start point) -- call this
+xform_start -- as a regular instrumentation function.  The runtime call to do
+the registration will build the appropriate data structures which will encode
+what metric to associate with the target function, aggregation method, return
+value, etc.
+
+2. Register the transformation function (for the end point) -- call this
+xform_end -- as a regular instrumentation function.
+
+3. When xform_start is invoked as a regular instrumentation function, it will
+instrument the target function with the selected instrumentation.
+
+This is the hardest step to conceptualize and realize.  The problem is that,
+without any placeholders from phase 1, it's not clear that we can instrument the
+target function easily.  Clearly, our instrumentation points are at the start
+and end points of the function (entry and exit).  But this is not really true.
+Our instrumentation points are really at the entry and *all* function exits.
+
+The important question is, what if we have all of the exit points at our
+disposal? Would that change anything?
+
+It would. The entry point together with all exit points would form a set of
+instrumentation points. For each of these instrumentation points, we could
+overwrite the instruction with a branch to a new slot that would call the desired instrumentation
+function, restore the replaced instruction, and return to the instrumentation
+point to continue execution.  This would potentially work. One major problem
+that comes to mind is that for system calls (such as read()), the body of the
+function is highly likely to be out of short-jmp range to the
+tracecache-allocated slot.  The only way around this would be to create a heap
+region and copy the target function into it, etc.  We don't have any code to do
+this yet, so again there is no code reuse or (much) leveraging of existing
+functionality.  Additionally, work will have to be done to build the data
+structure that maps the address of a function to its extents (which come from
+the ELF information).
+
+--
+
+The other alternative is to make use of a "wrapper" function (e.g., for the
+function read()):
+
+int read_wrapper(args for read) {
+  start_inst();
+  int rc = read(args for read);
+  end_inst();
+  return rc;
+}
+
+But this isn't an option because we cannot locate the calls to read() to replace
+them with the wrapper.  We could do the following:
+
+3a. Copy the entire target function to a heap region, and instrument it to our
+heart's content.  However, finding exit points may not be easy without a CFG,
+etc.
+
+3b. Replace the body of the real target function with a call to the modified
+duplicate of the target function, returning whatever the modified duplicate
+returns.
+
+This works, I think, but is incredibly cumbersome and, contrary to what was
+previously discussed, we do *not* possess all of the mechanisms.
+
+--
+
+Notes on POV-Ray experiment 23 Jun 2003
+
+We propose using a user-defined metric, flops/display pixel, to compute the
+computational cost per pixel.
+
+1. Compute flops/dpixel and report using pp mechanisms.
+  1a. flops can be obtained via PAPI, IIRC.
+
+2. Create a moving average of flops/dpixel.
+
+3. When the flops/dpixel exceeds mavg flops/dpixel by some multiplicative
+threshold, a performance assertion is violated (a rough sketch of this check
+appears below).  When the PA is violated, we report other metrics (perhaps
+ranked in some manner) that had been recorded but not reported.  Initial
+suggestions for the other metrics to measure _over the same region_ (say, the
+main routine for a single ray-trace) are:
+
+    1) MAverage L1 icache misses vs. this-ray L1 icache misses
+    2) MAverage L2 cache misses vs. this-ray L2 cache misses
+    3) MAverage load count vs. this-ray load count
+    4) MAverage store count vs. this-ray store count
+
+Immediate problem: I don't think we can monitor more than 2 of these values
+concurrently due to hardware limitations on the SPARC...
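
The check itself is straightforward; here is a rough sketch, where
readFlopCount() stands in for whatever PAPI-backed counter read we end up
using, and the smoothing factor and threshold values are arbitrary placeholders.

// Hypothetical per-pixel check: maintain a moving average of flops per display
// pixel and flag a violation when the current pixel exceeds it by a
// multiplicative threshold.
extern long long readFlopCount();        // assumed PAPI-backed counter read

static double movingAvg       = 0.0;
static const double alpha     = 0.05;    // smoothing factor (placeholder)
static const double threshold = 2.0;     // multiplicative threshold (placeholder)

bool checkPixel() {
    static long long lastCount = readFlopCount();
    long long now = readFlopCount();
    double flopsThisPixel = double(now - lastCount);
    lastCount = now;

    bool violated = movingAvg > 0.0 && flopsThisPixel > threshold * movingAvg;
    movingAvg = (1.0 - alpha) * movingAvg + alpha * flopsThisPixel;
    return violated;   // on violation, the caller reports the other recorded metrics
}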
 
 }}}




