[llvm-commits] CVS: llvm/lib/Reoptimizer/Inst/InstManip.cpp InstManip.h Phases.cpp design.txt
Joel Stanley
jstanley at cs.uiuc.edu
Tue Apr 29 13:31:01 PDT 2003
Changes in directory llvm/lib/Reoptimizer/Inst:
InstManip.cpp updated: 1.7 -> 1.8
InstManip.h updated: 1.8 -> 1.9
Phases.cpp updated: 1.12 -> 1.13
design.txt updated: 1.9 -> 1.10
---
Log message:
Phase3-generated phase 4 slots now spill global registers properly.
---
Diffs of the changes:
Index: llvm/lib/Reoptimizer/Inst/InstManip.cpp
diff -u llvm/lib/Reoptimizer/Inst/InstManip.cpp:1.7 llvm/lib/Reoptimizer/Inst/InstManip.cpp:1.8
--- llvm/lib/Reoptimizer/Inst/InstManip.cpp:1.7 Tue Apr 15 16:26:19 2003
+++ llvm/lib/Reoptimizer/Inst/InstManip.cpp Tue Apr 29 13:36:53 2003
@@ -12,6 +12,7 @@
#include "InstManip.h"
const unsigned InstManip::NOP_INST = 0x01000000;
+uint64_t InstManip::sm_phase3SpillRegion[InstManip::SHARED_SIZE];
using std::cout;
using std::cerr;
@@ -154,16 +155,77 @@
"Unexpected number of instructions in code sequence for call");
}
+// NB: The generateSpillShared/generateRestoreShared routines currently fill the snippet
+// (which comes from a slot) with a bunch of code to save and restore the global
+// registers. This blows up the size of the required slot quite a bit -- it would be
+// better to generate calls to functions saveGlobalRegs() and restoreGlobalRegs(), for
+// example. However, this works for now, and writing those functions means determining
+// what the inline assembly should look like. The ifdef'd-out region below is a start,
+// but it is incomplete and generates errors at assembly time. In particular, the SPARC
+// assembler requires a '.register' directive before it witnesses a use of %g2, %g3,
+// %g6, or %g7, and that directive does not appear to be emitted simply by using the
+// inline assembly. :( TODO.
+//
+
+#if 0
+void restoreGlobRegs()
+{
+ // asm ("assembly template" : "output constraints" : "input constraints")
+ // Restore the global registers %g[1-7] from the globalRegs array.
+
+ asm("ldx %0, %%g1"::"o" (globalRegs));
+ asm("ldx %0, %%g2"::"o" (globalRegs+1));
+ asm("ldx %0, %%g3"::"o" (globalRegs+2));
+ asm("ldx %0, %%g4"::"o" (globalRegs+3));
+ asm("ldx %0, %%g5"::"o" (globalRegs+4));
+ asm("ldx %0, %%g6"::"o" (globalRegs+5));
+ asm("ldx %0, %%g7"::"o" (globalRegs+6));
+}
+#endif
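
One possible way past the '.register' problem just described -- an untested editor's
sketch, not part of this commit: emit the directives once at file scope, before any
inline asm that touches the affected registers, and use array-element lvalues for the
memory operands. 'globalRegs' stands in for the spill area, as in the disabled code
above.

    #include <stdint.h>

    static uint64_t globalRegs[7];   // stand-in for the register spill area

    // SPARC V9 assemblers want these app-register directives before the
    // first use of %g2/%g3/%g6/%g7; emitting them from a file-scope asm
    // statement places them ahead of the function bodies.
    asm(".register %g2, #scratch\n"
        ".register %g3, #scratch\n"
        ".register %g6, #scratch\n"
        ".register %g7, #scratch");

    void restoreGlobRegs()
    {
        // "m" element lvalues instead of "o" with pointer arithmetic;
        // %g1/%g4/%g5 need no directive.
        asm("ldx %0, %%g1" : : "m" (globalRegs[0]));
        asm("ldx %0, %%g2" : : "m" (globalRegs[1]));
        asm("ldx %0, %%g3" : : "m" (globalRegs[2]));
        asm("ldx %0, %%g4" : : "m" (globalRegs[3]));
        asm("ldx %0, %%g5" : : "m" (globalRegs[4]));
        asm("ldx %0, %%g6" : : "m" (globalRegs[5]));
        asm("ldx %0, %%g7" : : "m" (globalRegs[6]));
    }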
+
+void InstManip::generateRestoreShared(uint64_t restoreFromAddr,
+ std::vector<unsigned>& snippet,
+ TargetRegister reg) const
+{
+ generateLoad(restoreFromAddr, snippet, reg);
+
+ unsigned destReg = (reg == REG_0) ? R_O0 : R_O1;
+
+ snippet.push_back(MK_LOAD_IMM(R_G1, destReg, 8));
+ snippet.push_back(MK_LOAD_IMM(R_G2, destReg, 16));
+ snippet.push_back(MK_LOAD_IMM(R_G3, destReg, 24));
+ snippet.push_back(MK_LOAD_IMM(R_G4, destReg, 32));
+ snippet.push_back(MK_LOAD_IMM(R_G5, destReg, 40));
+ snippet.push_back(MK_LOAD_IMM(R_G6, destReg, 48));
+ snippet.push_back(MK_LOAD_IMM(R_G7, destReg, 56));
+}
+
void InstManip::generateRestore(std::vector<unsigned>& snippet) const
{
// restore %o0, 0, %o0
- snippet.push_back(MK_RESTORE(R_O0, R_O0, 0));
+ snippet.push_back(MK_RESTORE_IMM(R_O0, R_O0, 0));
+}
+
+void InstManip::generateSpillShared(uint64_t spillToAddr,
+ std::vector<unsigned>& snippet,
+ TargetRegister reg) const
+{
+ generateLoad(spillToAddr, snippet, reg);
+
+ unsigned destReg = (reg == REG_0) ? R_O0 : R_O1;
+
+ snippet.push_back(MK_STORE_IMM(R_G1, destReg, 8));
+ snippet.push_back(MK_STORE_IMM(R_G2, destReg, 16));
+ snippet.push_back(MK_STORE_IMM(R_G3, destReg, 24));
+ snippet.push_back(MK_STORE_IMM(R_G4, destReg, 32));
+ snippet.push_back(MK_STORE_IMM(R_G5, destReg, 40));
+ snippet.push_back(MK_STORE_IMM(R_G6, destReg, 48));
+ snippet.push_back(MK_STORE_IMM(R_G7, destReg, 56));
}
void InstManip::generateSave(std::vector<unsigned>& snippet) const
{
// save %o0, 0, %o0
- snippet.push_back(MK_SAVE(R_O0, R_O0, 0));
+ snippet.push_back(MK_SAVE_IMM(R_O0, R_O0, 0));
}
void InstManip::generateBranchAlways(uint64_t dest,
Index: llvm/lib/Reoptimizer/Inst/InstManip.h
diff -u llvm/lib/Reoptimizer/Inst/InstManip.h:1.8 llvm/lib/Reoptimizer/Inst/InstManip.h:1.9
--- llvm/lib/Reoptimizer/Inst/InstManip.h:1.8 Fri Apr 18 12:29:00 2003
+++ llvm/lib/Reoptimizer/Inst/InstManip.h Tue Apr 29 13:36:53 2003
@@ -43,6 +43,11 @@
return m_insts;
}
+ const std::vector<std::pair<uint64_t, unsigned> >& getInsts() const
+ {
+ return m_insts;
+ }
+
void push_back(uint64_t addr, unsigned inst)
{
m_insts.push_back(std::make_pair(addr, inst));
@@ -103,7 +108,15 @@
void generateRestore(std::vector<unsigned>& snippet) const;
void generateSave(std::vector<unsigned>& snippet) const;
+
+ void generateSpillShared(uint64_t spillToAddr,
+ std::vector<unsigned>& snippet,
+ TargetRegister reg = REG_0) const;
+ void generateRestoreShared(uint64_t restoreFromAddr,
+ std::vector<unsigned>& snippet,
+ TargetRegister reg = REG_0) const;
+
void generateBranchAlways(uint64_t dest,
uint64_t slotBase,
std::vector<unsigned>& snippet,
@@ -120,16 +133,21 @@
// These are functions so when InstManip is superclassed, they'd become virtual, etc.
// In the short term we could use class constants, but this is more clear.
- unsigned getNOP() const { return NOP_INST; }
- unsigned getGenLoadSize() const { return 6; }
- unsigned getGenCallSize() const { return 2; }
- unsigned getGenBranchAlwaysSize() const { return 2; }
- unsigned getGenSaveSize() const { return 1; }
- unsigned getGenRestoreSize() const { return 1; }
- unsigned getInstWidth() const { return 4; }
+ unsigned getNOP() const { return NOP_INST; }
+ unsigned getGenLoadSize() const { return 6; }
+ unsigned getGenCallSize() const { return 2; }
+ unsigned getGenBranchAlwaysSize() const { return 2; }
+ unsigned getGenSaveSize() const { return 1; }
+ unsigned getGenSpillSharedSize() const { return getGenLoadSize() + SHARED_SIZE; }
+ unsigned getGenRestoreSharedSize() const { return getGenLoadSize() + SHARED_SIZE; }
+ unsigned getGenRestoreSize() const { return 1; }
+ unsigned getInstWidth() const { return 4; }
+ unsigned getSharedSize() const { return SHARED_SIZE; }
inline unsigned getAddressCopySize(unsigned loadInst) const;
+ uint64_t getPhase3SpillAddr() { return (uint64_t) sm_phase3SpillRegion; }
+
private:
InstManip() {}
@@ -154,7 +172,18 @@
static const unsigned BRANCH_ALWAYS_BASE = 0x30480000;
static const unsigned NOP_INST;
+ // Size (in number of 64-bit words) required for storing shared registers
+ static const unsigned SHARED_SIZE = 7;
+
VirtualMem* m_pVM;
+
+ // Memory region into which to spill shared registers when executing a phase 4 slot
+ // (i.e., the slot that invokes the phase4 function, the slot written by phase 3
+ // invocations). NB: One region is sufficient and we do not need stack semantics
+ // because only one activation of a phase 4 slot ever occurs at a given time (assuming
+ // single-threaded execution).
+
+ static uint64_t sm_phase3SpillRegion[SHARED_SIZE];
};
void InstManip::printRange(uint64_t start, uint64_t end) const
@@ -208,6 +237,5 @@
return 1;
}
-
#endif // _INCLUDED_INSTMANIP_H
Index: llvm/lib/Reoptimizer/Inst/Phases.cpp
diff -u llvm/lib/Reoptimizer/Inst/Phases.cpp:1.12 llvm/lib/Reoptimizer/Inst/Phases.cpp:1.13
--- llvm/lib/Reoptimizer/Inst/Phases.cpp:1.12 Fri Apr 18 12:29:00 2003
+++ llvm/lib/Reoptimizer/Inst/Phases.cpp Tue Apr 29 13:36:53 2003
@@ -43,11 +43,13 @@
// 3c. In the new slot, write the contents of the phase 3 slot:
// +---------------------------------------+
// | save registers |
+// | save global registers |
// | copy load-src addr to param1 register |
// | load p4 struct ptr to param2 register |
// | call to phase 4 |
// | nop |
// | restore registers |
+// | restore global registers |
// | branch back to orig code |
// | nop |
// +---------------------------------------+
@@ -56,6 +58,26 @@
//
// PHASE 4:
//
+// 1. Examine the tag (i.e. load-src addr) passed by phase 3
+// 1a. If tag is in GBT, we have a valid candidate, so do step 2.
+// 1b. If tag is not in GBT, our candidate is invalid, so delete slot and return to
+// original code.
+//
+// 2. Set up the second phase 4 slot that will actually call the instrumentation function:
+// +---------------------------------------+
+// | save registers |
+// | save global registers |
+// | call to inst func |
+// | nop |
+// | restore registers |
+// | restore global registers |
+// | branch back to orig code |
+// | nop |
+// +---------------------------------------+
+// This "instrumentation slot" may have to be expanded later to store the return value
+// in an alloca'd temporary, unless the phase4 function itself can invoke the
+// instrumentation function, which would be *highly* ideal.
+//
#include <stdlib.h>
#include <iostream>
@@ -79,11 +101,13 @@
// obtained in the same manner.
extern unsigned ppGBTSize;
-extern struct PrimInfo {
+struct PrimInfo {
unsigned gbtType;
unsigned short* loadVar;
unsigned gbtStartIdx;
-} ppGBT[];
+};
+
+extern PrimInfo ppGBT[];
typedef std::pair<uint64_t, uint64_t> AddressRange;
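
To make the GBT lookup in phase 4 below concrete, here is a hypothetical hand-written
entry of the kind phase 1 would normally emit; all values are illustrative, not from
this commit:

    // Illustrative stand-in for the phase-1-generated table.
    static unsigned short someLoadVar;   // variable whose loads are instrumented
    PrimInfo ppGBT[] = {
        { 0, &someLoadVar, 0 }           // gbtType, loadVar, gbtStartIdx
    };
    unsigned ppGBTSize = 1;

    // Phase 4 treats a candidate as valid when its tag (the load-source
    // address) equals (uint64_t) ppGBT[i].loadVar for some i.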
@@ -184,8 +208,8 @@
};
// Phase3 is the class that is responsible for making the "phase 3" transformation; the
-// global function phase3() is responsible for constructing a one Phase3 instance per
-// invocation and for deallocating the originating slot.
+// global function phase3() is responsible for constructing one Phase3 instance per
+// invocation and invoking transform on it.
class Phase3
{
@@ -206,6 +230,28 @@
InstManip m_instManip;
};
+// Phase4 is the class that is responsible for making the "phase 4" transformation; the
+// global function phase4() is responsible for constructing one Phase4 instance per
+// invocation and invoking transform on it.
+
+class Phase4
+{
+ public:
+ Phase4(uint64_t tag, Phase4Info* p4info);
+ ~Phase4();
+
+ void transform();
+
+ private:
+ Phase4(): m_instManip(0) {}
+
+ inline unsigned getSlotSize() const;
+
+ Phase4Info* m_pPhase4Info;
+ TraceCache* m_pTraceCache;
+ InstManip m_instManip;
+ uint64_t m_tag; // Entry to look for in the GBT
+};
//////////////// Phase 2 implementation ////////////////
@@ -250,12 +296,12 @@
static void copySnippetToSlot(vector<unsigned>& snippet,
uint64_t slotBase,
VirtualMem* vm,
- InstManip* im)
+ InstManip& im)
{
uint64_t currAddr = slotBase;
for(vector<unsigned>::iterator i = snippet.begin(), e = snippet.end(); i != e; ++i) {
vm->writeInstToVM(currAddr, *i);
- currAddr += im->getInstWidth();
+ currAddr += im.getInstWidth();
}
}
@@ -289,7 +335,7 @@
// Copy the snippet code into the slot
assert(snippet.size() == getSlotSize() && "Snippet size does not match slot size");
- copySnippetToSlot(snippet, slotBase, vm, &m_instManip);
+ copySnippetToSlot(snippet, slotBase, vm, m_instManip);
}
unsigned Phase2::getSlotSize() const
@@ -344,6 +390,21 @@
delete m_pPhase3Info;
}
+static uint64_t replaceInstWithBrToSlot(uint64_t srcAddr,
+ unsigned slotSize,
+ TraceCache* tc,
+ InstManip& im)
+{
+ // Obtain a new slot of the given size
+ uint64_t slotBase = tc->getMemMgr()->getMemory(slotSize);
+ assert(slotBase && "Unable to obtain memory from MemoryManager instance");
+
+ // Replace instruction at srcAddr with branch to start of new slot
+ tc->getVM()->writeInstToVM(srcAddr, im.getBranchAlways(slotBase, srcAddr));
+
+ return slotBase;
+}
+
void Phase3::processCandidates(vector<InstCandidate>& candidates)
{
// For each load candidate, obtain a new slot and write the phase 3 slot region
@@ -352,6 +413,7 @@
for(vector<InstCandidate>::iterator i = candidates.begin(), e = candidates.end(); i != e; ++i) {
cerr << "Transforming " << *i << endl;
+#if 0
uint64_t slotBase = m_pTraceCache->getMemMgr()->getMemory(getSlotSize(*i));
assert(slotBase && "Unable to obtain memory from MemoryManger instance");
@@ -359,6 +421,10 @@
VirtualMem* vm = m_pTraceCache->getVM();
uint64_t loadAddr = i->front().first;
vm->writeInstToVM(loadAddr, m_instManip.getBranchAlways(slotBase, loadAddr));
+#endif
+ // Replace load candidate instruction with a branch to the start of a new slot.
+ uint64_t slotBase = replaceInstWithBrToSlot(i->front().first, getSlotSize(*i),
+ m_pTraceCache, m_instManip);
// Generate a) code to save the registers, b) instruction(s) to store the load
// source address into a phase4 parameter register, c) the load of (the
@@ -368,11 +434,15 @@
Phase4Info* p4info = new Phase4Info(*i, slotBase, getSlotSize(*i), m_pTraceCache);
+ uint64_t spillAddr = m_instManip.getPhase3SpillAddr();
+
vector<unsigned> snippet;
m_instManip.generateSave(snippet);
- m_instManip.generateAddressCopy(i->front().second, snippet);
+ m_instManip.generateAddressCopy(i->front().second, snippet); // Uses InstManip::REG_0, live to call
+ m_instManip.generateSpillShared(spillAddr, snippet, InstManip::REG_1);
m_instManip.generateLoad((uint64_t) p4info, snippet, InstManip::REG_1);
m_instManip.generateCall((uint64_t) &phase4, slotBase, snippet);
+ m_instManip.generateRestoreShared(spillAddr, snippet);
m_instManip.generateRestore(snippet);
m_instManip.generateBranchAlways(i->front().first, slotBase, snippet);
@@ -387,7 +457,7 @@
// Copy the snippet code into the slot
assert(snippet.size() == getSlotSize(*i) && "Snippet size does not match slot size");
- copySnippetToSlot(snippet, slotBase, vm, &m_instManip);
+ copySnippetToSlot(snippet, slotBase, m_pTraceCache->getVM(), m_instManip);
// just one candidate for now
break;
@@ -401,8 +471,10 @@
return m_instManip.getGenSaveSize() +
m_instManip.getAddressCopySize(cand.front().second) +
+ m_instManip.getGenSpillSharedSize() +
m_instManip.getGenLoadSize() +
m_instManip.getGenCallSize() +
+ m_instManip.getGenRestoreSharedSize() +
m_instManip.getGenRestoreSize() +
m_instManip.getGenBranchAlwaysSize();
}
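
Spelling out the new getSlotSize() arithmetic with the sizes defined in InstManip.h,
where A is the per-candidate address-copy size:

    save(1) + addrCopy(A) + spillShared(6+7) + load(6) + call(2)
        + restoreShared(6+7) + restore(1) + branchAlways(2)
    = 38 + A instructions, i.e. (38 + A) * 4 bytes per phase 3 slot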
@@ -422,33 +494,120 @@
//////////////// Phase4 implementation ////////////////
-void phase4(uint64_t tag, Phase4Info* p4info)
+void phase4(uint64_t tag, Phase4Info* p4info)
{
- cerr << "phase4 invoked!" << endl;
+ cerr << "phase 4 fcn, tag is " << tag << endl;
+ Phase4 p4(tag, p4info);
+ p4.transform();
+}
- cerr << "tag is " << std::hex << tag << endl;
+Phase4::Phase4(uint64_t tag, Phase4Info* p4info):
+ m_pPhase4Info(p4info),
+ m_pTraceCache(p4info->getTraceCache()),
+ m_instManip(p4info->getTraceCache()->getVM()),
+ m_tag(tag)
+{
+ cerr << "phase4 ctor: tag is " << tag << endl;
+ cerr << "================ Begin Phase 4 ================\n";
+}
- cerr << "inst candidate inside info structure is: " << endl;
- cerr << p4info->getCandidate() << endl;
+Phase4::~Phase4()
+{
+ // Deallocate the originating slot (i.e. the slot that invoked us).
+ //
+ // NB: Yes, we are, in fact, deallocating a memory segment (i.e., the slot obtained by
+ // the TraceCache's MemoryManager instance) before returning to it. This is not a
+ // problem for single-threaded codes, because no threads may claim that memory and
+ // write to it. However, it does indeed pose a problem for multi-threaded codes. A
+ // modification to the general mechanism itself is required to achieve thread-safety.
- // (TEMP) For now, restore the candidate load to its original position for debugging
- // purposes.
+ uint64_t slotBase = m_pPhase4Info->getSlot();
+ unsigned slotSize = m_pPhase4Info->getSlotSize();
+ m_pTraceCache->getMemMgr()->freeTraceMemory(slotBase, slotSize);
- p4info->getTraceCache()->getVM()->writeInstToVM(p4info->getCandidate().front().first,
- p4info->getCandidate().front().second);
- delete p4info;
+ // Deallocate the parameter structure
+ delete m_pPhase4Info;
+}
- cerr << "ppGBT is: " << ppGBT << endl;
- cerr << "ppGBTSize is: " << ppGBTSize << endl;
+static void dumpGBT(std::ostream& ostr)
+{
+ ostr << "ppGBT is: " << ppGBT << endl;
+ ostr << "ppGBTSize is: " << ppGBTSize << endl;
- for(int i = 0; i < ppGBTSize; ++i) {
- cerr << "ppGBT[" << i << "]: " << ppGBT[i].gbtType << ", "
+ for(unsigned i = 0; i < ppGBTSize; ++i) {
+ ostr << "ppGBT[" << i << "]: " << ppGBT[i].gbtType << ", "
<< ppGBT[i].loadVar << ", " << ppGBT[i].gbtStartIdx << endl;
}
-
- // tmp
- if(tag == (uint64_t)(ppGBT[0].loadVar)) {
- cerr << "TAG MATCHES, BOYYYYYYYYYYY!" << endl;
+}
+
+static PrimInfo* searchGBT(uint64_t tag)
+{
+ // Traverse the GBT and determine if the tag is there.
+ for(unsigned i = 0; i < ppGBTSize; ++i) {
+ uint64_t tagInTable = (uint64_t) ppGBT[i].loadVar;
+ if(tagInTable == tag)
+ return &ppGBT[i];
}
- // tmp
+ return 0;
+}
+
+void fakeInstFunc(double* param)
+{
+ cerr << "I AM AN INSTRUMENTATION FUNCTION, FEAR ME!" << endl;
+ *param = 3.14;
+}
+
+void Phase4::transform()
+{
+ cerr << "tag is " << m_tag << endl;
+ dumpGBT(cerr);
+
+ if(PrimInfo* pi = searchGBT(m_tag)) {
+ cerr << "Tag matches." << endl;
+
+ const InstCandidate& cand = m_pPhase4Info->getCandidate();
+#if 0
+ // Make a new slot that calls the instrumentation function, inserting a branch to
+ // it over the original code.
+
+ uint64_t slotBase = replaceInstWithBrToSlot(cand.front().first, getSlotSize(),
+ m_pTraceCache, m_instManip);
+#endif
+
+ // Write NOPs over the original instructions that were associated with the elected
+ // candidate, but leave the branch instruction intact.
+
+ VirtualMem* vm = m_pTraceCache->getVM();
+ for(vector<std::pair<uint64_t, unsigned> >::const_iterator i = cand.getInsts().begin() + 1,
+ e = cand.getInsts().end(); i != e; ++i)
+ vm->writeInstToVM(i->first, m_instManip.getNOP());
+
+ // Write the instructions to call the instrumentation function
+
+ void* instFuncVP = (void*) fakeInstFunc; // From the GBT eventually
+ void (*instFunc)(void*) = (void (*)(void*)) instFuncVP;
+
+ void* mem = malloc(sizeof(double));
+ instFunc(mem);
+ printf("%f\n", *((double*) mem));
+ free(mem);
+ }
+ else {
+ cerr << "Could not find tag" << endl;
+ // The candidate failed to get elected, so pack up and go home. Restore the
+ // replaced instruction (i.e. the branch that invoked this code) with the original
+ // instruction at that location.
+
+ VirtualMem* vm = m_pPhase4Info->getTraceCache()->getVM();
+ vm->writeInstToVM(m_pPhase4Info->getCandidate().front().first,
+ m_pPhase4Info->getCandidate().front().second);
+ }
+
+ // (TEMP) For now, restore the candidate load to its original position for debugging
+ // purposes.
+
+ m_pPhase4Info->getTraceCache()->getVM()->writeInstToVM(m_pPhase4Info->getCandidate().front().first,
+ m_pPhase4Info->getCandidate().front().second);
+
+ cerr << "================ End Phase 4 ================\n";
}
Index: llvm/lib/Reoptimizer/Inst/design.txt
diff -u llvm/lib/Reoptimizer/Inst/design.txt:1.9 llvm/lib/Reoptimizer/Inst/design.txt:1.10
--- llvm/lib/Reoptimizer/Inst/design.txt:1.9 Fri Apr 18 12:29:00 2003
+++ llvm/lib/Reoptimizer/Inst/design.txt Tue Apr 29 13:36:53 2003
@@ -886,12 +886,15 @@
{{{ TODO
- - Investigate trace-cache dummy function mechanisms, decide on approach A or B
- in phase outline
+ - Get phase 2 allocation of spill space working, write spill code (to spill space) for
+ phase 3 invocation. (Currently NO spilling is being done, which is not safe)
- - Implement phase outline
+ - Ensure phase 3 writes proper spill code for phase 4 invocation. (One spill space
+ should be sufficient)
- - Read EEL paper to get a better feel for binary modification issues
+ - Start table-of-stacks implementation for phase4 authorship of phase 5 slots.
+
+ - Write phase 5 slot generation code, phase 5 function itself, etc.
}}}
@@ -1006,13 +1009,10 @@
Approach A:
3e. Write phase 4 code in slot:
- if(actually an instrumentation site)
- rewrite branch at C to next instruction
- call proper instrumentation fnction <- C branches to here
- branch back to C
- else
- restore original instructions
- branch back to C
+
+ Load address being loaded by candidate load instruction.
+ Call phase 4 function
+ branch back to C
Approach B:
@@ -1025,7 +1025,134 @@
restore original instructions
branch back to C
- In phase 4: No special action needed.
+ In phase 4:
+
+ Actions of phase 4 function.
+
+ 1. Check tag to verify GBT membership. If not found in GBT, do nothing besides
+ return to the original code, etc.
+
+ 2. Assuming tag is valid, we must decide between one of two approaches at this
+ juncture:
+
+ a) Try to invoke the instrumentation function directly from within phase 4.
+ b) Write code in yet another slot that will invoke the inst function.
+
+ The primary problem to solve in both of these approaches is how to allocate space for
+ values that are stored to / read from by the instrumentation function. That is, for
+ point metrics, we must construct the semantic equivalent of a function call like:
+
+ foo = someInstFunc();
+
+ where storage for foo has already been allocated by phase 1 (hence we can store its
+ address in the GBT). However, we know nothing about the *type* of the return value,
+ only its size. We must determine the conventions of the call mechanisms for passing
+ back large (i.e. bigger than a register size) objects by value. We can call
+ someInstFunc easily enough, but we must know how to write the code (using either
+ approach a or b above) to take the return value of the function and store it to the
+ metric variable. Phase 1 can store the address and the size of this variable, so it
+ should be simple enough to take the returned-by-value return value of the
+ instrumentation function and perform a memcpy to the appropriate location. This works
+ fine for point metrics, but the problem is worsened significantly by region metrics,
+ because we must have a temporary value around to store the return value of the start
+ function and pass it by address to the end function...this probably has to be
+ accomplished via heap storage -- we had previously thought that alloca would be
+ sufficient, but I don't think this works (the runtime stack is manipulated in between
+ the time that the alloca'd variable would be stored to and the time that it would be
+ read, in the case of interval metrics).
+
+ Looks like we're going to have to do everything from the standpoint of parameter sizes,
+ memcpy's, and heap-allocated temporaries. The only way I can think of to do this in a
+ straightforward manner is to use the phase 4 function itself to do the call to the inst
+ function and the subsequent memcpy. But then we must compile the call to the inst
+ function, and we don't know what type it returns, because this information is not
+ preserved. If it returns, say, a scalar double, how do we store this value in a
+ temporary and copy it to the metric variable (in the case of point metrics, for
+ example)? The only thing I can think of at present is to change the signature
+ conventions...instead of an instrumentation function returning stuff by value, it is
+ instead passed a pointer parameter. Since we will know the sizes of the types from
+ phase 1, we can always heap-allocate the appropriately-sized parameter and pass this
+ raw pointer in to be used by the function as appropriate...but what kinds of problems
+ can be caused here? This is a good topic for conversation with Vikram...talked to
+ Vikram, and the problem is worse than I had originally thought. Heap allocation isn't
+ really an option because we would have to have one heap alloc/dealloc per interval
+ invocation, which is just too expensive.
+
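
A minimal sketch of the pointer-parameter convention floated above, with illustrative
names only; the allocation strategy is exactly the open question, so malloc here is a
placeholder, not a proposal:

    #include <stdlib.h>
    #include <string.h>

    // Dummy instrumentation function; it writes its result through the
    // caller-provided pointer instead of returning by value
    // (cf. fakeInstFunc in Phases.cpp).
    void someInstFunc(void* result)
    {
        *(double*) result = 3.14;
    }

    // Point-metric invocation: phase 1 records the metric variable's
    // address and size, so the runtime only needs a memcpy.
    void invokePointMetric(void* metricVarAddr, unsigned metricVarSize)
    {
        void* tmp = malloc(metricVarSize);          // placeholder storage
        someInstFunc(tmp);                          // inst func fills the buffer
        memcpy(metricVarAddr, tmp, metricVarSize);  // store to the metric variable
        free(tmp);
    }
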
+ A more accurate assessment of the problem.
+
+ We must have a stack-oriented way of saving temporary values between the start interval
+ function and the end interval function. We had thought that we could do this via
+ alloca (i.e. manipulation of the stack pointer to obtain new space). However, the only
+ way this can occur is if we use the current stack frame. Let's say that we want to
+ allocate n slots (i.e. n * 8 bytes). Then, we would do:
+
+ %sp = %sp + (n * 8)
+ %reg = %sp + B + X
+
+ Where reg is just some register (we must spill/restore it before we clobber it here),
+ and X is the offset from %sp + B to the location on the stack where the newly-alloca'd
+ region is to start. This must be "lower" than any previous allocas but "higher" than
+ the end of the contents starting at %sp + B (B is the bias). According to the SparcV9
+ ABI, the size of X is equal to 128 bytes (for register spills) + 48 bytes (6 outgoing
+ registers, each with extended word size) + Q, where Q is the space required for "extra
+ outgoing arguments", that is, arguments to functions beyond the 6th. Q is equal to the
+ greatest number of parameters of any function call within the function body associated
+ with the stack frame (-6, or 0 if no call has parameter width exceeding 6). For
+ example, if S is the stack frame associated with a particular invocation of the
+ function foo, and foo called some function, bar, that took 10 parameters, and no other
+ function was called by foo that had greater than 10 parameters, Q would exactly equal
+ 4.
+
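Concretely, for the foo/bar example:

    X = 128 (register-save area)
      + 48  (6 outgoing argument slots * 8 bytes)
      + 32  (Q: 4 extra outgoing arguments * 8 bytes)
      = 208 bytes
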
+ The problem is that, although the value of Q is known at compile-time, determining it
+ during phase 1 is premature (the vendor compiler may arbitrarily add arguments to
+ functions, for example), and determining it at runtime (on the assembly code itself) is
+ quite possibly not feasible (indirect functions, no way to really determine what are
+ parameters and what are not, etc). We currently do not have a way to obtain this value,
+ and so an attempt to solve this problem using the alloca approach must be abandoned.
+
+ One easy solution that presents itself is to do heap-based stuff, but this is very
+ inefficient and also quite expensive.
+
+ Idea: Manage a stack on the side. The objection to this is that it involves extra
+ function calls.
+
+ In the meantime, in the interests of making forward progress, can we do anything with
+ heap allocation? Remember that we need a stack region into which we can spill the
+ global & FP registers, as well as the data between start- and end-interval functions.
+
+ A note about saving/restoring the global and FP registers: we know that the slot
+ created by phase 4 executes only once. Hence, it is valid to have phase 3 heap-allocate
+ a region large enough to spill the registers (the spill code would be placed in the
+ slot that calls the phase 4 function) and restore the registers. The phase 4 function
+ would have to deallocate this heap region, which means that the epilogue in the phase 4
+ slot would be restoring the registers from a deleted chunk. Or, a call to free the
+ chunk could be placed in the slot itself. This is really the same problem as slot
+ deallocation in general, and shouldn't be a problem in single-threaded codes. However,
+ we must determine a mechanism by which heap allocation can occur for the register
+ spills, and the use of the allocated regions must correspond on a per-invocation basis
+ (i.e. stack semantics) appropriately. One idea is to use a "one-off" approach -- for
+ example, the phase 4 function would heap-allocate a spill region (R) to be used by the
+ first "real" invocation of the instrumentation (phase 5?). Each phase 5 invocation
+ would spill to and restore from region R, and would allocate a heap region (R') to be
+ used by the next invocation of the instrumentation. Of course, there'd have to be a new
+ slot created that would spill to this new region, etc. The regions (heap and slot)
+ could only be recycled as the call stack was popped. This is so gross I don't think
+ that it is an option. So, spilling the global and FP registers is even more of a
+ problem than the data transfer between the start- and end-function invocations, and I
+ think we have to go back to a global stack approach.
+
+ Phase 4 initially creates a (large) heap region which will act as the global stack. It
+ writes the phase-5 slot to use this address to spill to, and the stack base is held
+ onto somehow. The phase 5 slot spills to the current stack pointer, and invokes the
+ phase 5 function. The phase 5 function will allocate space at stack pointer + regsave
+ size for whatever data needs to pass between the start- and end- functions. A pointer
+ to the start of the storage region for start- function is passed into the function,
+ etc, and the OFFSET FROM THE STACK POINTER is stored in the field in the INTERVAL_START
+ record. The phase 5 slot (after the call to the phase 5 function) restores from the
+ current stack pointer, but *does not change the stack pointer*. The phase 5 invocation
+ would also reallocate the stack space if it detected that more space was needed
+ (important but not vital for the prototype implementation -- it can be "big enough" in
+ the initial implementation). See handwritten notes for more detail.
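
A data-structure sketch of the global-stack idea above, under its stated assumptions
(single-threaded, "big enough" initial size, no reallocation); every name is
illustrative, and the push/pop framing is an editor's simplification of the handwritten
notes:

    #include <stdlib.h>

    #define GSTACK_BYTES  (64 * 1024)  // "big enough" for the prototype
    #define REGSAVE_BYTES (7 * 8)      // %g1-%g7, cf. SHARED_SIZE

    static char* gStackBase;           // allocated once by phase 4
    static char* gStackPtr;            // current global-stack pointer

    void gstackInit()
    {
        gStackBase = gStackPtr = (char*) malloc(GSTACK_BYTES);
    }

    // A phase 5 slot spills registers at gStackPtr; the phase 5 function
    // then reserves interval data just above the register-save area, and
    // the data's offset is what gets recorded in the INTERVAL_START record.
    unsigned gstackPushFrame(unsigned dataBytes)
    {
        unsigned dataOffset = (gStackPtr - gStackBase) + REGSAVE_BYTES;
        gStackPtr += REGSAVE_BYTES + dataBytes;   // bump for this invocation
        return dataOffset;
    }

    void gstackPopFrame(unsigned dataBytes)
    {
        gStackPtr -= REGSAVE_BYTES + dataBytes;   // unwound at interval end
    }
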
{{{ Notes on using the total-copy approach in the prototype implementation.