[llvm-commits] CVS: llvm/lib/Reoptimizer/Inst/lib/InstManip.h PhaseInfo.h Phases.cpp SparcInstManip.cpp SparcInstManip.h design.txt

Joel Stanley jstanley at cs.uiuc.edu
Mon May 12 21:01:48 PDT 2003


Changes in directory llvm/lib/Reoptimizer/Inst/lib:

InstManip.h updated: 1.16 -> 1.17
PhaseInfo.h updated: 1.4 -> 1.5
Phases.cpp updated: 1.26 -> 1.27
SparcInstManip.cpp updated: 1.6 -> 1.7
SparcInstManip.h updated: 1.7 -> 1.8
design.txt updated: 1.13 -> 1.14

---
Log message:

Final checkin before abandoning the alloca approach altogether. Present for posterity.


---
Diffs of the changes:

Index: llvm/lib/Reoptimizer/Inst/lib/InstManip.h
diff -u llvm/lib/Reoptimizer/Inst/lib/InstManip.h:1.16 llvm/lib/Reoptimizer/Inst/lib/InstManip.h:1.17
--- llvm/lib/Reoptimizer/Inst/lib/InstManip.h:1.16	Thu May  8 11:27:25 2003
+++ llvm/lib/Reoptimizer/Inst/lib/InstManip.h	Mon May 12 21:00:22 2003
@@ -26,6 +26,8 @@
 #include <map>
 #include <vector>
 
+#include "Phase1/Intraphase.h"
+
 class TraceCache;
 class Phase2;
 class Phase3;
@@ -33,7 +35,7 @@
 class InstCandidate;
 class Phase3Info;
 class Phase4Info;
-struct PrimInfo;
+struct GBTElem;
 
 class InstManip 
 {
@@ -69,7 +71,7 @@
                                std::vector<unsigned>& snippet) = 0;
 
     // For the phase 5 slot
-    virtual void     buildSlot(PrimInfo* pi,
+    virtual void     buildSlot(GBTElem* gbte,
                                uint64_t slotBase,
                                uint64_t instAddr,
                                const std::pair<uint64_t, uint64_t>& extents,
@@ -77,7 +79,7 @@
 
     virtual unsigned getSlotSize(Phase2* p2) const = 0;
     virtual unsigned getSlotSize(Phase3* p3, InstCandidate& cand) const = 0;
-    virtual unsigned getSlotSize(Phase4* p4) const = 0;
+    virtual unsigned getSlotSize(Phase4* p4, pp::GBTEntryType type) const = 0;
 
     // findCandidates - Build the vector of instruction candidates that occur in the
     // region defined by the given addresses. This is necessarily a platform-dependent


Index: llvm/lib/Reoptimizer/Inst/lib/PhaseInfo.h
diff -u llvm/lib/Reoptimizer/Inst/lib/PhaseInfo.h:1.4 llvm/lib/Reoptimizer/Inst/lib/PhaseInfo.h:1.5
--- llvm/lib/Reoptimizer/Inst/lib/PhaseInfo.h:1.4	Fri May  9 23:01:50 2003
+++ llvm/lib/Reoptimizer/Inst/lib/PhaseInfo.h	Mon May 12 21:00:22 2003
@@ -15,18 +15,20 @@
 
 #define DEBUG 1
 #if DEBUG
-#define DEBUG_MSG(x) std::cerr << x
+#define VERBOSE 1
+#define DEBUG_MSG(v, x) if(VERBOSE >= v) std::cerr << x
 #else
-#define DEBUG_MSG(x)
+#define DEBUG_MSG(v, x)
 #endif
 
 typedef std::pair<uint64_t, uint64_t> AddressRange;
 
-typedef struct PrimInfo {
+struct GBTElem {
     unsigned gbtType;
     unsigned short* loadVar;
     unsigned gbtStartIdx;
     unsigned paramSize;
+    void*    instFunc;
 };
 
 class Phase3Info 


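For reference, the two-argument DEBUG_MSG above gates each message on a verbosity threshold instead of a single on/off switch. A minimal usage sketch, assuming the definitions from the diff (the call sites and values are illustrative); note that because the macro expands to an unbraced if, a call site followed by an else should be wrapped in braces:

    #include <iostream>

    #define VERBOSE 1
    #define DEBUG_MSG(v, x) if(VERBOSE >= v) std::cerr << x

    int main() {
        DEBUG_MSG(1, "level-1 message, value = " << 42 << '\n'); // printed: VERBOSE >= 1
        DEBUG_MSG(2, "level-2 message\n");  // compiled in, but skipped since VERBOSE < 2
        return 0;
    }
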
Index: llvm/lib/Reoptimizer/Inst/lib/Phases.cpp
diff -u llvm/lib/Reoptimizer/Inst/lib/Phases.cpp:1.26 llvm/lib/Reoptimizer/Inst/lib/Phases.cpp:1.27
--- llvm/lib/Reoptimizer/Inst/lib/Phases.cpp:1.26	Fri May  9 23:01:50 2003
+++ llvm/lib/Reoptimizer/Inst/lib/Phases.cpp	Mon May 12 21:00:23 2003
@@ -2,7 +2,7 @@
 // programmer: Joel Stanley
 //       date: Fri Apr  4 16:59:48 CST 2003
 //     fileid: Phases.cpp
-//    purpose: Implements various runtime phases of the peformance-oriented language
+//    purpose: Implements runtime phases 2-5 of the performance-oriented language
 //             extensions.
 //
 // PHASE 2:
@@ -78,7 +78,7 @@
 // obtained in the same manner.
 
 extern unsigned ppGBTSize;
-extern PrimInfo ppGBT[];
+extern GBTElem ppGBT[];
 
 typedef std::pair<uint64_t, uint64_t> AddressRange;
 
@@ -161,10 +161,10 @@
 
 void Phase2::transform()
 {
-    DEBUG_MSG("============================== Begin Phase 2 ==============================\n");
+    DEBUG_MSG(1, "============================== Begin Phase 2 ==============================\n");
     
     const char* execName = getexecname();
-    DEBUG_MSG("Executable name is: " << execName << endl);
+    DEBUG_MSG(1, "Executable name is: " << execName << endl);
 
     ElfReader elfReader(execName);
     
@@ -179,7 +179,7 @@
     while(elfReader.findNextSymbol(funcName, range, m_pIM->getInstWidth()))
         funcs.push_back(std::make_pair(funcName, range));
 
-    DEBUG_MSG("There are " << funcs.size() << " functions to process." << endl);
+    DEBUG_MSG(1, "There are " << funcs.size() << " functions to process." << endl);
 
     m_pIM->makePhase3SpillRegion(funcs.size());
     
@@ -191,13 +191,13 @@
             //cerr << i->first << " is to be transformed" << endl;
 
             if(i->first == "fibs") {
-                DEBUG_MSG("Transforming function " << i->first << "...\n");
+                DEBUG_MSG(1, "Transforming function " << i->first << "...\n");
                 transformFunction(i->second);
             }
         }
     }
 
-    DEBUG_MSG("============================== End Phase 2 ===========================\n");
+    DEBUG_MSG(1, "============================== End Phase 2 ===========================\n");
 }
 
 
@@ -262,8 +262,8 @@
     vector<unsigned> snippet;
     m_pIM->buildSlot(p3info, snippet);
 
-#if DEBUG
-    DEBUG_MSG("phase3 slot instructions:\n");
+    DEBUG_MSG(2, "phase3 slot instructions:\n");
+#if VERBOSE > 1
     dumpSnippet(snippet, m_pIM);
 #endif
 
@@ -284,7 +284,7 @@
     m_pTC(p3info->getTraceCache()),
     m_pIM(p3info->getIM())
 {
-    DEBUG_MSG("================ Begin Phase 3 [" << std::hex
+    DEBUG_MSG(1, "================ Begin Phase 3 [" << std::hex
               << m_pPhase3Info->getRange().first << ", "
               << m_pPhase3Info->getRange().second
               << "] ================\n");
@@ -319,26 +319,26 @@
     // For each load candidate, obtain a new slot and write the phase 4 slot region
     // contents into it.
 
-    DEBUG_MSG("There are " << candidates.size() << " candidates to process\n");
+    DEBUG_MSG(1, "There are " << candidates.size() << " candidates to process\n");
 
-    for(vector<InstCandidate>::iterator i = candidates.begin(), e = candidates.end(); i != e; ++i) {
-        DEBUG_MSG("Transforming " << *i << endl);
-        unsigned slotSize = m_pIM->getSlotSize(this, *i);
+    for(unsigned i = 0, e = candidates.size(); i < e; ++i) {
+        DEBUG_MSG(1, "Transforming " << candidates[i] << endl);
+        unsigned slotSize = m_pIM->getSlotSize(this, candidates[i]);
 
         // Replace load candidate instruction with a branch to the start of a new slot.
-        uint64_t slotBase = replaceInstWithBrToSlot(i->front().first, slotSize,
+        uint64_t slotBase = replaceInstWithBrToSlot(candidates[i].front().first, slotSize,
                                                     m_pTC, m_pIM);
 
         // Build the Phase4Info structure and generate the phase 4 slot.
 
-        Phase4Info* p4info = new Phase4Info(*i, m_pPhase3Info->getRange(),
+        Phase4Info* p4info = new Phase4Info(candidates[i], m_pPhase3Info->getRange(),
                                             slotBase, slotSize, m_pTC, m_pIM);
 
         vector<unsigned> snippet;
         m_pIM->buildSlot(p4info, snippet);
 
-#if DEBUG
-        DEBUG_MSG("phase4 slot instructions:\n");
+        DEBUG_MSG(2, "phase4 slot instructions:\n");
+#if VERBOSE > 1
         dumpSnippet(snippet, m_pIM);
 #endif
 
@@ -346,7 +346,7 @@
         copySnippetToSlot(snippet, slotBase, m_pTC->getVM(), m_pIM);
 
         // just one candidate for now
-        break;
+        //break;
     }
 }
 
@@ -358,7 +358,7 @@
 
     // ...and process them
     processCandidates(candidates);
-    DEBUG_MSG("============================== End Phase 3 ==============================\n");
+    DEBUG_MSG(1, "============================== End Phase 3 ==============================\n");
 }
 
 //////////////// Phase4 implementation ////////////////
@@ -375,7 +375,7 @@
     m_pIM(p4info->getIM()),
     m_tag(tag)
 {
-    DEBUG_MSG("================ Begin Phase 4 ================\n");
+    DEBUG_MSG(1, "================ Begin Phase 4 ================\n");
 }
 
 Phase4::~Phase4() 
@@ -407,7 +407,7 @@
     }
 }
 
-static PrimInfo* searchGBT(uint64_t tag)
+static GBTElem* searchGBT(uint64_t tag)
 {
     // Traverse the GBT and determine if the tag is there.
     for(unsigned i = 0; i < ppGBTSize; ++i) {
@@ -421,13 +421,13 @@
 void Phase4::transform()
 {
 
-#if DEBUG
-    //dumpGBT(cerr);
-    DEBUG_MSG("tag is " << m_tag << ", and ");
+#if VERBOSE > 0
+    dumpGBT(cerr);
 #endif
+    DEBUG_MSG(1, "tag is " << m_tag << ", and ");
 
-    if(PrimInfo* pi = searchGBT(m_tag)) {
-        DEBUG_MSG("matches.\n");
+    if(GBTElem* gbte = searchGBT(m_tag)) {
+        DEBUG_MSG(1, "matches.\n");
 
         const InstCandidate& cand = m_pPhase4Info->getCandidate();
         assert(cand.getInsts().size() >= 2
@@ -444,15 +444,15 @@
 
         // Obtain memory (& rewrite branch) to the phase 5 slot.
         
-        unsigned slotSize = m_pIM->getSlotSize(this);
+        unsigned slotSize = m_pIM->getSlotSize(this, (pp::GBTEntryType) gbte->gbtType);
         uint64_t repAddr = cand.front().first;
         uint64_t slotBase = replaceInstWithBrToSlot(repAddr, slotSize, m_pTC, m_pIM);
 
         vector<unsigned> snippet;
-        m_pIM->buildSlot(pi, slotBase, repAddr, m_pPhase4Info->getRange(), snippet);
+        m_pIM->buildSlot(gbte, slotBase, repAddr, m_pPhase4Info->getRange(), snippet);
 
-#if DEBUG
-        DEBUG_MSG("phase 5 slot contents:\n");
+        DEBUG_MSG(2, "phase 5 slot contents:\n");
+#if VERBOSE > 1
         dumpSnippet(snippet, m_pIM);
 #endif
 
@@ -470,7 +470,7 @@
 #endif
     }
     else {
-        DEBUG_MSG("does not match\n");
+        DEBUG_MSG(1, "does not match\n");
         // The candidate failed to get elected, so pack up and go home.  Restore the
         // replaced instruction (i.e. the branch that invoked this code) with the original
         // instruction at that location.
@@ -488,15 +488,36 @@
                                                            m_pPhase4Info->getCandidate().front().second);
 #endif
 
-    DEBUG_MSG("================ End Phase 4 ================\n");
+    DEBUG_MSG(1, "================ End Phase 4 ================\n");
 }
 
 //////////////// Phase 5 implementation ////////////////
 
-void phase5(PrimInfo* pi, void* paramMem)
+void phase5(GBTElem* gbte, void* paramMem, void* startParamMem)
 {
-    DEBUG_MSG("phase5 function invoked\n");
-    DEBUG_MSG("pi->paramSize == " << pi->paramSize << endl);
-    DEBUG_MSG("pi->loadVar (tag) == " << pi->loadVar << endl);
-    DEBUG_MSG("phase 5 function exiting\n");
+    DEBUG_MSG(1, "phase5 function invoked\n");
+    DEBUG_MSG(1, "gbte->gbtType == " << gbte->gbtType << endl);
+    DEBUG_MSG(1, "gbte->paramSize == " << gbte->paramSize << endl);
+    DEBUG_MSG(1, "gbte->loadVar (tag) == " << gbte->loadVar << endl);
+    DEBUG_MSG(1, "Calling instrumentation function...\n");
+
+    switch(gbte->gbtType){
+        case pp::GBT_INTERVAL_START: {
+            DEBUG_MSG(1, "paramMem address is " << paramMem << endl);
+            DEBUG_MSG(1, "sp+BIAS+off is " << startParamMem << endl);
+            void (*instFunc)(void*) = (void (*)(void*)) gbte->instFunc;
+            instFunc(paramMem);
+            break;
+        }
+        case pp::GBT_INTERVAL_END: {
+            DEBUG_MSG(1, "paramMem address is " << paramMem << endl);
+            DEBUG_MSG(1, "sp + BIAS + off address is " << startParamMem << endl);
+            //DEBUG_MSG(1, "startParamMem address is " << startParamMem << endl);
+            //void (*instFunc)(void*, void*) = (void (*)(void*, void*)) gbte->instFunc;
+            //instFunc(paramMem, startParamMem);
+            break;
+        }
+    }
+
+    DEBUG_MSG(1, "phase 5 function exiting\n");
 }


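The rewritten phase5 above dispatches on gbte->gbtType and invokes the instrumentation function through the untyped instFunc pointer carried in the GBT entry. A self-contained sketch of that cast-and-call pattern, assuming the void(void*) signature used for GBT_INTERVAL_START (the struct is abbreviated, and startTimer/invokeStartSite are hypothetical names):

    #include <iostream>

    struct GBTElem {
        unsigned gbtType;
        unsigned paramSize;
        void*    instFunc;   // instrumentation function, stored untyped in the GBT
    };

    static void startTimer(void* paramMem) {
        std::cerr << "start-interval instrumentation invoked\n";
    }

    // Mirrors the GBT_INTERVAL_START arm of phase5: recover the function's
    // real type from the untyped pointer, then call it with the parameter memory.
    void invokeStartSite(GBTElem* gbte, void* paramMem) {
        void (*instFunc)(void*) = (void (*)(void*)) gbte->instFunc;
        instFunc(paramMem);
    }

    int main() {
        char params[8];
        GBTElem e = { 0, sizeof(params), (void*) &startTimer };
        invokeStartSite(&e, params);
        return 0;
    }
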
Index: llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.cpp
diff -u llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.cpp:1.6 llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.cpp:1.7
--- llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.cpp:1.6	Fri May  9 23:01:50 2003
+++ llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.cpp	Mon May 12 21:00:23 2003
@@ -42,7 +42,7 @@
 //           | alloc spill area/reg save/inst param region on stack |
 //           |          manually-save clobbered registers           |
 //           |              spill shared registers                  |
-//           |              copy PrimInfo ptr to param 1            |
+//           |              copy GBTElem ptr to param 1             |
 //           |            copy spill area addr to param 2           |
 //           |                   call phase 5                       |
 //           |                        nop                           |
@@ -77,7 +77,7 @@
 
 void phase3(Phase3Info* p3info);
 void phase4(uint64_t tag, Phase4Info* p4info);
-void phase5(PrimInfo* pi, void* paramMem);
+void phase5(GBTElem* gbte, void* paramMem, void* startParamMem);
 
 SparcInstManip::SparcInstManip(TraceCache* tc):
     InstManip(tc, SHARED_SIZE, INST_WIDTH, NOP_INST),
@@ -168,67 +168,23 @@
            "Snippet size does not match expected slot size");
 }
 
-void SparcInstManip::buildSlot(PrimInfo* pi,
+void SparcInstManip::buildSlot(GBTElem* gbte,
                                uint64_t slotBase,
                                uint64_t instAddr,
                                const std::pair<uint64_t, uint64_t>& extents,
                                std::vector<unsigned>& snippet)
 {
-    // Before we generate code to spill the shared registers, we must first search in the
-    // vicinity of the instrumentation site (i.e., the branch to the slot, formerly the
-    // load-volatile) to discover a marker alloca that will tell us the correct offset in
-    // the current stack frame. The search boundaries are given by the 'extents' pair,
-    // which is the address range of the enclosing function.
-
-    unsigned offset = findAllocaOffset(instAddr, extents);
-    unsigned sharedSize = WORD_WIDTH * getSharedSize();
-    unsigned stkSize = sharedSize + WORD_WIDTH * 2 + pi->paramSize;
-
-    if(stkSize % STACK_ALIGN != 0)
-        cerr << "Warning: not multiple of " << STACK_ALIGN << endl;
-
-    DEBUG_MSG("buildSlot(p5) obtained offset " << std::dec
-              << offset << std::hex << endl);
-
-    // After our alloca'd stack region looks like:
-    //  sp + BIAS + stkSize -> +--------------------------------+
-    //                         | inst function parameter memory | } pi->paramSize
-    //                         +--------------------------------+
-    //                         | save area for clobbered regs   | } WORD_WIDTH * 2 
-    //                         +--------------------------------+
-    //                         | spill region for shared regs   | } sharedSize
-    //  sp + BIAS + offset ->  +--------------------------------+
-
-    // TODO: ensure that stack size is aligned properly
-    
-    startCode(snippet);
-
-    generateAlloca(stkSize);
-
-    // "Manually" save REG_0, REG_1
-    generateStackStore(REG_0, offset + sharedSize);
-    generateStackStore(REG_1, offset + sharedSize + WORD_WIDTH);
-
-    generateSpillShared(offset);
-
-    generateLoad((uint64_t) pi, REG_0, REG_1);      // REG_0 live to call
-    generateStackStore(REG_0, PARAM_0);
-
-    generateSPLoad(REG_1, offset + stkSize - pi->paramSize); // REG_1 live to call
-    generateStackStore(REG_1, PARAM_1);
-
-    generateCall((uint64_t) &phase5, slotBase);
-    generateRestoreShared(offset);
-
-    // "Manually" restore REG_0, REG_1
-    generateStackLoad(REG_0, offset + sharedSize);
-    generateStackLoad(REG_1, offset + sharedSize + WORD_WIDTH);
-
-    // We need to branch back to one instruction beyond the branch to the phase 5 slot.
-    generateBranchAlways(instAddr + getInstWidth(), slotBase, getNOP());
-    endCode();
-
-    // TODO: Add assert against against the snippet size.
+    switch(gbte->gbtType) {
+        case pp::GBT_INTERVAL_START:
+            buildStartIntervalSlot(gbte, slotBase, instAddr, extents, snippet);
+            return;
+        case pp::GBT_INTERVAL_END:
+            buildEndIntervalSlot(gbte, slotBase, instAddr, extents, snippet);
+            return;
+        default:
+            assert(0 && "Unhandled gbtType encountered");
+            return;
+    }
 }
 
 unsigned SparcInstManip::getSlotSize(Phase2* p2) const
@@ -266,7 +222,7 @@
         GEN_BRANCH_ALWAYS_SIZE;
 }
 
-unsigned SparcInstManip::getSlotSize(Phase4* p4) const
+unsigned SparcInstManip::getSlotSize(Phase4* p4, pp::GBTEntryType type) const
 {
     // The following sum corresponds to the sizes consumed by the various regions of the
     // the slot constructed by phase 4, called the phase 5 slot. See ASCII diagram of
@@ -280,8 +236,11 @@
         GEN_SPL_STK_SIZE + 
         GEN_LOAD_SIZE +
         GEN_STKSTORE_SIZE +
-        GEN_SPLOAD_SIZE +
+        GEN_SPOFFSET_SIZE +
         GEN_STKSTORE_SIZE +
+        // FIXME
+        //(type == pp::GBT_INTERVAL_END) ?
+        //         GEN_SPOFFSET_SIZE + GEN_STKSTORE_SIZE : 0) +
         GEN_CALL_SIZE +
         GEN_STKSTORE_SIZE +
         GEN_UNSPL_STK_SIZE + 
@@ -338,18 +297,20 @@
     fflush(stdout);
 }
 
-void SparcInstManip::generateSPLoad(LogicalRegister reg, unsigned offset)
+void SparcInstManip::generateSPOffset(LogicalRegister reg, unsigned offset)
 {
-    // Loads the value of %sp + offset into reg
+    // Loads the value of %sp + BIAS + offset into reg
     assert(m_pCurrSnippet && "Invalid snippet for code generation");
     unsigned initSize = m_pCurrSnippet->size();
 
-    m_pCurrSnippet->push_back(MK_ADD_R_I(m_logicalToActualReg[reg], R_O6, offset));
+    m_pCurrSnippet->push_back(MK_ADD_R_I(m_logicalToActualReg[reg], R_O6, offset + BIAS));
 
-    assert(m_pCurrSnippet->size() - initSize == GEN_SPLOAD_SIZE &&
+    assert(m_pCurrSnippet->size() - initSize == GEN_SPOFFSET_SIZE &&
            "Unexpected number of instructions in code sequence for SP load");
 }
 
+// generateAlloca - Generate code to allocate 'size' bytes on the stack
+
 void SparcInstManip::generateAlloca(unsigned size)
 {
     assert(m_pCurrSnippet && "Invalid snippet for code generation");
@@ -537,8 +498,8 @@
 
 void SparcInstManip::generateRestoreShared(unsigned offset)
 {
-    // Un-spill from the stack -- assumes %sp + BIAS + offset points to a valid stack
-    // location.
+    // Generate code to un-spill the shared registers from the memory at
+    // %sp + BIAS + offset. Assumes %sp + BIAS + offset points to a valid stack location.
 
     assert(m_pCurrSnippet && "Invalid snippet for code generation");
     unsigned initSize = m_pCurrSnippet->size();
@@ -553,6 +514,9 @@
                                            LogicalRegister tmp1,
                                            LogicalRegister tmp2) 
 {
+    // Generate code to un-spill the shared registers from restoreFromAddr, using tmp1 and
+    // tmp2 as temporary registers
+
     assert(m_pCurrSnippet && "Invalid snippet for code generation");
     assert(tmp1 != tmp2 && "Distinct logical registers required");
 
@@ -582,9 +546,9 @@
 
 void SparcInstManip::generateSpillShared(unsigned offset)
 {
-    // Spill to the stack -- assumes %sp + BIAS + offset points to a valid stack location,
-    // and that there is sufficient valid memory at %sp + BIAS + offset for the entire
-    // spill size.
+    // Spill the shared registers to the stack (i.e. the memory at %sp + BIAS + offset).
+    // Assumes %sp + BIAS + offset points to a valid stack location, and that there is
+    // sufficient valid memory at %sp + BIAS + offset for the entire spill size.
 
     assert(m_pCurrSnippet && "Invalid snippet for code generation");
     unsigned initSize = m_pCurrSnippet->size();
@@ -599,6 +563,9 @@
                                          LogicalRegister tmp1,
                                          LogicalRegister tmp2)
 {
+    // Generate code to spill the shared registers to spillFromAddr, using tmp1 and tmp2
+    // as temporary registers
+
     assert(m_pCurrSnippet && "Invalid snippet for code generation");
     assert(tmp1 != tmp2 && "Distinct logical registers required");
 
@@ -860,21 +827,22 @@
 
         if(0 == (imm - BIAS) % STACK_ALIGN) {
             offset = imm - BIAS;
-            DEBUG_MSG("Alloca marker case (a)\n");
+            DEBUG_MSG(1, "Alloca marker case (a)\n");
         }
         else if(0 == (imm - BIAS - STACK_ALIGN + 1) % STACK_ALIGN) {
             offset = imm - BIAS - STACK_ALIGN + 1;
-            DEBUG_MSG("Alloca marker case (b)\n");
+            DEBUG_MSG(1, "Alloca marker case (b)\n");
         }
         else
             assert(0 && "Alloca special cases failed, need fallback implementation");
 
-#if DEBUG
-        DEBUG_MSG("Found alloca marker: ");
+        DEBUG_MSG(1, "Found alloca marker: ");
+#if VERBOSE > 0
         sparc_print(inst);
         fflush(stdout);
-        DEBUG_MSG(endl);
 #endif
+        DEBUG_MSG(1, endl);
+
         return true;
     }
     
@@ -917,3 +885,163 @@
     assert(0 && "Failed to find alloca marker");
     return 0;
 }
+
+void SparcInstManip::buildStartIntervalSlot(GBTElem* gbte,
+                                            uint64_t slotBase,
+                                            uint64_t instAddr,
+                                            const std::pair<uint64_t, uint64_t>& extents,
+                                            std::vector<unsigned>& snippet)
+{
+    // Before we generate code to spill the shared registers, we must first search in the
+    // vicinity of the instrumentation site (i.e., the branch to the slot, formerly the
+    // load-volatile) to discover a marker alloca that will tell us the correct offset in
+    // the current stack frame. The search boundaries are given by the 'extents' pair,
+    // which is the address range of the enclosing function.
+
+    unsigned offset = findAllocaOffset(instAddr, extents);
+    unsigned sharedSize = WORD_WIDTH * getSharedSize();
+    unsigned stkSize = sharedSize + WORD_WIDTH * 2 + gbte->paramSize;
+
+    if(stkSize % STACK_ALIGN != 0)
+        cerr << "Warning: not multiple of " << STACK_ALIGN << endl;
+
+    DEBUG_MSG(1, "buildStartSlot obtained offset " << std::dec
+              << offset << ", and stack size is " << stkSize << std::hex << endl);
+
+    // After the alloca, our stack region looks like:
+    //  sp + BIAS + stkSize -> +--------------------------------+
+    //                         | inst function parameter memory | } gbte->paramSize
+    //                         +--------------------------------+
+    //                         | save area for clobbered regs   | } WORD_WIDTH * 2 
+    //                         +--------------------------------+
+    //                         | spill region for shared regs   | } sharedSize
+    //  sp + BIAS + offset ->  +--------------------------------+
+
+    // TODO: ensure that stack size is aligned properly
+    
+    startCode(snippet);
+
+    generateAlloca(stkSize);
+
+    // "Manually" save REG_0, REG_1
+    generateStackStore(REG_0, offset + sharedSize);
+    generateStackStore(REG_1, offset + sharedSize + WORD_WIDTH);
+
+    generateSpillShared(offset);
+    
+    generateLoad((uint64_t) gbte, REG_0, REG_1);      // REG_0 live to call
+    generateStackStore(REG_0, PARAM_0);
+
+    DEBUG_MSG(1, "param1 (from offset+bias) = " << std::dec
+              << (stkSize - gbte->paramSize)
+              << std::hex << endl);
+
+    generateSPOffset(REG_1, offset + stkSize - gbte->paramSize); // REG_1 live to call
+    generateStackStore(REG_1, PARAM_1);
+
+    //tmp -- store %sp + BIAS + offset in third parameter
+    generateSPOffset(REG_2, offset);
+    generateStackStore(REG_2, PARAM_2);
+    //tmp
+
+    generateCall((uint64_t) &phase5, slotBase);
+    generateRestoreShared(offset);
+
+    // "Manually" restore REG_0, REG_1
+    generateStackLoad(REG_0, offset + sharedSize);
+    generateStackLoad(REG_1, offset + sharedSize + WORD_WIDTH);
+
+    // We need to branch back to one instruction beyond the branch to the phase 5 slot.
+    generateBranchAlways(instAddr + getInstWidth(), slotBase, getNOP());
+    endCode();
+
+    // TODO: Add an assert against the snippet size.
+}
+    
+void SparcInstManip::buildEndIntervalSlot(GBTElem* gbte,
+                                          uint64_t slotBase,
+                                          uint64_t instAddr,
+                                          const std::pair<uint64_t, uint64_t>& extents,
+                                          std::vector<unsigned>& snippet)
+{
+    unsigned offset = findAllocaOffset(instAddr, extents);
+    unsigned sharedSize = WORD_WIDTH * getSharedSize();
+    unsigned stkSize = sharedSize + WORD_WIDTH * 2 + gbte->paramSize;
+    
+    if(stkSize % STACK_ALIGN != 0)
+        cerr << "Warning: not multiple of STACK_ALIGN" << endl;
+    
+    DEBUG_MSG(1, "buildEndSlot obtained offset " << std::dec
+              << offset << ", and stack size is " << stkSize << std::hex << endl);
+    
+    // After the alloca, our stack region looks like:
+    //  (current implementation assumes no %sp manipulation occurs between start- and
+    //  end-region sites)
+    //                         +--------------------------------+
+    //                         | ... stk area for start site ...|
+    //  sp + BIAS + stkSize -> +--------------------------------+
+    //                         |   inst function param1 memory  | } gbte->paramSize
+    //                         +--------------------------------+
+    //                         | save area for clobbered regs   | } WORD_WIDTH * 2 
+    //                         +--------------------------------+
+    //                         | spill region for shared regs   | } sharedSize
+    //  sp + BIAS + offset ->  +--------------------------------+
+    //                          
+
+    // TODO: ensure that stack size is aligned properly
+    
+    startCode(snippet);
+    
+    generateAlloca(stkSize);
+    
+    // "Manually" save REG_0, REG_1
+    generateStackStore(REG_0, offset + sharedSize);
+    generateStackStore(REG_1, offset + sharedSize + WORD_WIDTH);
+    
+    generateSpillShared(offset);
+    
+    generateLoad((uint64_t) gbte, REG_0, REG_1); // REG_0 live to call
+    generateStackStore(REG_0, PARAM_0);
+    
+    DEBUG_MSG(1, "param1 (from offset+bias) = " << std::dec
+              << (stkSize - gbte->paramSize)
+              << std::hex << endl);
+    
+    generateSPOffset(REG_1, offset + stkSize - gbte->paramSize); // REG_1 live to call
+    generateStackStore(REG_1, PARAM_1);
+    
+    // Generate code for computing the address of the stack location where the return
+    // value of the start site is kept (i.e., -paramSize from the top of the start-region
+    // stack region). Note that we have made the KIS concession that no %sp manipulation
+    // may occur between start- and end-region sites, and that the sizes of the return
+    // values for the start and end instrumentation functions are equal.
+
+    DEBUG_MSG(1, "param2 (from offset+bias) = " << std::dec
+              << (2 * stkSize - gbte->paramSize)
+              << std::hex << endl);
+
+#if 0
+    generateSPOffset(REG_2, offset + 2 * stkSize - gbte->paramSize);
+    generateStackStore(REG_2, PARAM_2);
+#endif
+
+    //tmp -- store %sp + BIAS + offset in third parameter
+    generateSPOffset(REG_2, offset);
+    generateStackStore(REG_2, PARAM_2);
+    //tmp
+    
+    generateCall((uint64_t) &phase5, slotBase);
+    generateRestoreShared(offset);
+    
+    // "Manually" restore REG_0, REG_1
+    generateStackLoad(REG_0, offset + sharedSize);
+    generateStackLoad(REG_1, offset + sharedSize + WORD_WIDTH);
+    
+    // We need to branch back to one instruction beyond the branch to the phase 5 slot.
+    generateBranchAlways(instAddr + getInstWidth(), slotBase, getNOP());
+    endCode();
+
+    // TODO: Add an assert against the snippet size.
+}
+
+


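To make the frame arithmetic in buildStartIntervalSlot concrete: the slot allocates stkSize bytes above %sp + BIAS + offset and hands phase5 a pointer to the parameter region via generateSPOffset(REG_1, offset + stkSize - paramSize). A small sketch of the region boundaries implied by the ASCII diagram, using WORD_WIDTH == 8 and illustrative values for the other quantities:

    #include <cstdio>

    int main() {
        const unsigned WORD_WIDTH = 8;
        unsigned sharedSize = WORD_WIDTH * 7; // e.g., seven shared registers spilled
        unsigned paramSize  = 16;             // gbte->paramSize (illustrative)
        unsigned offset     = 2048;           // from findAllocaOffset (illustrative)
        unsigned stkSize    = sharedSize + WORD_WIDTH * 2 + paramSize;

        // Byte ranges relative to %sp + BIAS, from the bottom of the region up:
        printf("shared-reg spill:  [%u, %u)\n", offset, offset + sharedSize);
        printf("REG_0/REG_1 save:  [%u, %u)\n", offset + sharedSize,
               offset + sharedSize + 2 * WORD_WIDTH);
        printf("inst-func params:  [%u, %u)\n", offset + stkSize - paramSize,
               offset + stkSize);
        return 0;
    }
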
Index: llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.h
diff -u llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.h:1.7 llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.h:1.8
--- llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.h:1.7	Fri May  9 23:01:50 2003
+++ llvm/lib/Reoptimizer/Inst/lib/SparcInstManip.h	Mon May 12 21:00:23 2003
@@ -14,6 +14,8 @@
 
 #include "InstManip.h"
 
+#include <map>
+
 class SparcInstManip : public InstManip
 {
   public:
@@ -25,7 +27,7 @@
     virtual void     buildSlot(Phase4Info* p3info,
                                std::vector<unsigned>& snippet);
 
-    virtual void     buildSlot(PrimInfo* pi,
+    virtual void     buildSlot(GBTElem* gbte,
                                uint64_t slotBase,
                                uint64_t instAddr,
                                const std::pair<uint64_t, uint64_t>& extents,
@@ -33,7 +35,7 @@
 
     virtual unsigned getSlotSize(Phase2* p2) const;
     virtual unsigned getSlotSize(Phase3* p3, InstCandidate& cand) const;
-    virtual unsigned getSlotSize(Phase4* p4) const;
+    virtual unsigned getSlotSize(Phase4* p4, pp::GBTEntryType type) const;
 
     virtual void     findCandidates(const std::pair<uint64_t, uint64_t>& range,
                                     std::vector<InstCandidate>& candidates);
@@ -46,7 +48,8 @@
 
   private:
     SparcInstManip() {}
-    typedef std::map<unsigned, unsigned> OutputToInputRegMap;
+    typedef std::map<unsigned, unsigned>               OutputToInputRegMap;
+    typedef std::map<GBTElem*, std::vector<uint64_t> > GBTStackMap;
 
     void             startCode(std::vector<unsigned>& snippet) { m_pCurrSnippet = &snippet; }
     void             endCode()                                 { m_pCurrSnippet = 0;        }
@@ -60,10 +63,7 @@
                                           unsigned delaySlotInstr = NOP_INST);
                      
     void             generateCall(uint64_t dest, uint64_t slotBase);
-                     
-    void             generateSPLoad(LogicalRegister reg, unsigned offset);
-
-    // generateAlloca - Generate code to allocate 'size' bytes on the stack
+    void             generateSPOffset(LogicalRegister reg, unsigned offset);
     void             generateAlloca(unsigned size);
 
     void             generateLoad(uint64_t value,
@@ -72,31 +72,20 @@
                      
     void             generateStackStore(LogicalRegister src, unsigned stkOffset);
     void             generateStackLoad(LogicalRegister dest, unsigned stkOffset);
-                     
     void             generateRestore();
     void             generateSave();
                      
-    // generateRestoreShared - Generate code to un-spill the shared registers from
-    // restoreFromAddr, using tmp1 and tmp2 as temporary registers
-    
     void             generateRestoreShared(uint64_t restoreFromAddr,
                                            LogicalRegister tmp1 = REG_0,
                                            LogicalRegister tmp2 = REG_1);
 
-    // generateRestoreShared - Generate code to un-spill the shared registers from the memory
-    // at %sp + BIAS + offset
     void             generateRestoreShared(unsigned offset);
-                     
-    // generateSpillShared - Generate code to spill the shared registers to spillFromAddr,
-    // using tmp1 and tmp2 as temporary registers
+
     void             generateSpillShared(uint64_t spillFromAddr,
                                          LogicalRegister tmp1 = REG_0,
                                          LogicalRegister tmp2 = REG_1);
 
-    // generateSpillShared - Generate code to spill the shared registers to the memory at
-    // %sp + BIAS + offset
     void             generateSpillShared(unsigned offset);
-                     
     unsigned         getRestoreInst() const;
     inline unsigned  getCallInst(uint64_t dest, uint64_t pc) const;
     inline unsigned  getGenAddressCopySize(unsigned loadInst) const;
@@ -123,9 +112,22 @@
     unsigned        findAllocaOffset(uint64_t instAddr,
                                      const std::pair<uint64_t, uint64_t>& range);
 
+    void            buildStartIntervalSlot(GBTElem* gbte,
+                                           uint64_t slotBase,
+                                           uint64_t instAddr,
+                                           const std::pair<uint64_t, uint64_t>& extents,
+                                           std::vector<unsigned>& snippet);
+    
+    void            buildEndIntervalSlot(GBTElem* gbte,
+                                         uint64_t slotBase,
+                                         uint64_t instAddr,
+                                         const std::pair<uint64_t, uint64_t>& extents,
+                                         std::vector<unsigned>& snippet);
+
     static bool     isAllocaSignature(unsigned inst, unsigned& offset);
 
-    std::vector<unsigned>* m_pCurrSnippet;       
+    std::vector<unsigned>* m_pCurrSnippet;
+    GBTStackMap            m_gbtStackMap;        // Maps GBTElem* -> param address stack
     OutputToInputRegMap    m_outputToInputReg;   // Maps input register -> output register
 
     // Size (in number of 64-bit words) required for storing shared registers
@@ -153,6 +155,7 @@
     static const unsigned WORD_WIDTH =   8;
     static const unsigned PARAM_0 =      128;
     static const unsigned PARAM_1 =      PARAM_0 + 8;
+    static const unsigned PARAM_2 =      PARAM_1 + 8;
 
     // Fixed sizes of generated SparcV9 assembly snippets
     static const unsigned GEN_LOAD_SIZE =          6;
@@ -167,7 +170,7 @@
     static const unsigned GEN_UNSPL_SIZE =         GEN_SPL_SIZE;
     static const unsigned GEN_UNSPL_STK_SIZE =     GEN_SPL_STK_SIZE;
     static const unsigned GEN_ALLOCA_SIZE =        1;
-    static const unsigned GEN_SPLOAD_SIZE =        1;
+    static const unsigned GEN_SPOFFSET_SIZE =      1;
 };
 
 unsigned SparcInstManip::getBranchAlways(uint64_t dest, uint64_t pc, bool annul) const


Index: llvm/lib/Reoptimizer/Inst/lib/design.txt
diff -u llvm/lib/Reoptimizer/Inst/lib/design.txt:1.13 llvm/lib/Reoptimizer/Inst/lib/design.txt:1.14
--- llvm/lib/Reoptimizer/Inst/lib/design.txt:1.13	Sun May  4 16:16:17 2003
+++ llvm/lib/Reoptimizer/Inst/lib/design.txt	Mon May 12 21:00:23 2003
@@ -880,7 +880,6 @@
 
 {{{ MILESTONES
 
-- Perform the "tracecache experiment" described in the TODO section.
 
 }}}
 
@@ -890,8 +889,77 @@
       (do not need variable-sized spill region except for phase5 invocations)
 
     - Start table-of-stacks implementation for phase4 authorship of phase 5 slots.
+      - Placed on hold temporarily because of the "alloca-finding" approach. However, see the
+        following e-mail for the current state of things:
 
-    - Write phase 5 slot generation code, phase 5 function itself, etc.
+        {{{ E-mail regarding alloca-finding and table-of-stacks approach
+Okay, this is starting to seem intractable. I have another problem that
+I don't think can be resolved without resorting to a custom-stack
+mechanism that will incur prohibitive overhead.
+
+Everything is working for start-region instrumentation sites.  For
+end-region instrumentation sites, however, there's a problem. In order
+to write the slot for end sites, I have to know (or know how to compute)
+the address of the return value of the corresponding start site. I had
+originally thought that I would just store this in the GBT, or
+"something", but I clearly didn't think through the problem well enough.
+
+There are only two ways I can think of to do this:
+
+(a) Write the effective address of the return value of the start inst
+func, so that it gets passed to the end inst func.
+
+or
+
+(b) Somehow encode the stack offset to the return value from the start
+inst, where the offset is from the %sp *at the end-region site*
+
+Both of these have problems.
+
+First, I don't think (b) can work at all, given that there may be
+alloca's present in the original application that would change the %sp,
+and thus the offset value that we'd need, and we can't determine the
+exact allocas that are executed statically.
+
+For (a), the effective address isn't known until runtime. We can store
+this address in some global table where the phase 4 invocation for the
+end site can find it, but it is not sufficient to have a single scalar
+address here -- we must have a stack, due to potential recursive
+invocations. I think that this is clear, please let me know if I'm not
+making sense. :) 
+
+Hence, we'd need to maintain a stack of effective addresses, which was
+pushed during the execution of phase 5 for the start site, and then read
+and popped during the execution of phase 5 for the end site.  We're
+already really bloated with how many instructions we've got going on for
+all of the spills, etc, and I'm concerned about the effect that this
+stack manipulation will have on our overhead, as we talked about before.
+
+The way I see it, we only have two options if we're to make forward
+progress and not obliterate our chances of having lower overhead
+numbers. Hopefully we have some better choices. In the interests of
+short-term forward progress, I'm going to go with #1 for now.
+
+#1 - Make another common-case assumption that there will be no allocas
+between start and end sites, on *any* control path. If this is the case,
+then we know that the stack pointer will not have been manipulated (I
+think) between the start and end sites, and so the %sp offsets to the
+requisite data will be unchanged since when the phase 5 step occurred
+for the start site.  
+
+#2 - Just implement our fall-back solution that everything seems to be
+pointing to. I'm not sure exactly what other logistic nightmares might
+be entailed in this, though, because I've only a sketch of the idea.
+
+I wanted to point out, also, that the so-called "fall back" approach we
+discussed previously also involves manipulation of a stack at runtime
+(push/pop actions still have to occur at runtime), so perhaps the stack
+of effective addresses is less prohibitive than I thought, if only in
+the sense that we cannot avoid it. :(
+        }}}
+
+    - Write phase 5 support for end-region sites -- will assume that no allocas lie between
+    the start and end sites, which is not a particularly fair assumption.
 
     - Optimizations:
         - No need to save registers (other than those clobbered) in phase 3 slot, since phase 3

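The GBTStackMap typedef added to SparcInstManip.h above (GBTElem* mapped to a vector of addresses, commented as a "param address stack") is the hook for the table-of-stacks scheme the e-mail converges on: the start-site phase 5 pushes the effective address of its return value, and the matching end site pops it, with a stack rather than a scalar precisely because of recursive invocations. A minimal sketch of that discipline (pushStartAddr/popStartAddr are hypothetical helper names):

    #include <stdint.h>
    #include <map>
    #include <vector>

    struct GBTElem;   // as declared in PhaseInfo.h

    typedef std::map<GBTElem*, std::vector<uint64_t> > GBTStackMap;
    static GBTStackMap gbtStackMap;

    // Start-region phase 5: remember where this activation's return value lives.
    void pushStartAddr(GBTElem* gbte, uint64_t effAddr) {
        gbtStackMap[gbte].push_back(effAddr);
    }

    // End-region phase 5: recover (and discard) the most recent start address.
    uint64_t popStartAddr(GBTElem* gbte) {
        std::vector<uint64_t>& stk = gbtStackMap[gbte];
        uint64_t addr = stk.back();
        stk.pop_back();
        return addr;
    }
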



