[llvm-commits] CVS: llvm/lib/Reoptimizer/Inst/InstManip.cpp InstManip.h Phases.cpp

Joel Stanley jstanley at cs.uiuc.edu
Tue Apr 15 16:24:01 PDT 2003


Changes in directory llvm/lib/Reoptimizer/Inst:

InstManip.cpp updated: 1.6 -> 1.7
InstManip.h updated: 1.6 -> 1.7
Phases.cpp updated: 1.10 -> 1.11

---
Log message:

The load-candidate heuristic is now implemented, and phase 4 is invoked
with the register contents passed as a parameter.
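
To illustrate how the new pieces fit together: the heuristic is driven roughly
as follows (a sketch condensed from Phase3::transform() below; 'manip' stands
for an InstManip bound to the traced process, and the address range comes from
the Phase3Info parameter):

    std::vector<InstCandidate> candidates;
    manip.findCandidates(startAddr, endAddr, candidates);

    for(std::vector<InstCandidate>::iterator i = candidates.begin(),
            e = candidates.end(); i != e; ++i)
        std::cerr << *i << std::endl;   // exercises the new printing support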


---
Diffs of the changes:

Index: llvm/lib/Reoptimizer/Inst/InstManip.cpp
diff -u llvm/lib/Reoptimizer/Inst/InstManip.cpp:1.6 llvm/lib/Reoptimizer/Inst/InstManip.cpp:1.7
--- llvm/lib/Reoptimizer/Inst/InstManip.cpp:1.6	Fri Apr 11 18:57:07 2003
+++ llvm/lib/Reoptimizer/Inst/InstManip.cpp	Tue Apr 15 16:26:19 2003
@@ -17,6 +17,31 @@
 using std::cerr;
 using std::endl;
 
+std::ostream& operator<<(std::ostream& ostr,
+                         const InstCandidate& cand) 
+{
+    cand.print(ostr);
+    return ostr;
+}
+
+void InstCandidate::print(std::ostream& ostr) const
+{
+    ostr << "InstCandidate {" << endl;
+    ostr << "  type = "
+         << (m_type == DIRECT ? "DIRECT" : "STACK XFER")
+         << endl;
+    ostr << "  Instruction dump (address, inst):" << endl;
+
+    for(std::vector<std::pair<uint64_t, unsigned> >::const_iterator i =
+            m_insts.begin(), e = m_insts.end(); i != e; ++i) {
+        ostr << std::hex << "  (" << i->first << ", " << std::flush;
+        sparc_print(i->second);
+        fflush(stdout);
+        ostr << ")" << endl;
+    }
+    ostr << "}";
+}
+
 void InstManip::printRange(unsigned* start, unsigned* end) const
 {
     // Dumps contents (and corresponding disassembly) of memory range given by range
@@ -93,9 +118,28 @@
            "Unexpected number of instructions in code sequence for 64-bit value -> %destReg");
 }
 
+void InstManip::generateAddressCopy(unsigned loadInst,
+                                    std::vector<unsigned>& snippet,
+                                    TargetRegister reg) const
+{
+    unsigned destReg = (reg == REG_0) ? R_O0 : R_O1;
+    unsigned rs1 = RD_FLD(loadInst, INSTR_RS1);
+    
+    if(RD_FLD(loadInst, INSTR_I)) {
+        // Case 1: load is immediate-valued --> reg, imm value add instruction needed
+        unsigned imm = RD_FLD(loadInst, INSTR_SIMM13);
+        snippet.push_back(MK_ADD_R_I(destReg, rs1, imm));
+    }
+    else {
+        // Case 2: load is register-valued --> reg, reg add instruction needed
+        unsigned rs2 = RD_FLD(loadInst, INSTR_RS2);
+        snippet.push_back(MK_ADD_R_R(destReg, rs1, rs2));
+    }
+}
+
 void InstManip::generateCall(uint64_t dest,
                              uint64_t slotBase,
-                             std::vector<unsigned>& snippet)
+                             std::vector<unsigned>& snippet) const
 {
     unsigned initSize = snippet.size();
     
@@ -110,10 +154,22 @@
            "Unexpected number of instructions in code sequence for call");
 }
 
+void InstManip::generateRestore(std::vector<unsigned>& snippet) const
+{
+    // restore %o0, 0, %o0
+    snippet.push_back(MK_RESTORE(R_O0, R_O0, 0));
+}
+
+void InstManip::generateSave(std::vector<unsigned>& snippet) const
+{
+    // save %o0, 0, %o0
+    snippet.push_back(MK_SAVE(R_O0, R_O0, 0));
+}
+
 void InstManip::generateBranchAlways(uint64_t dest,
                                      uint64_t slotBase,
                                      std::vector<unsigned>& snippet,
-                                     bool annul) 
+                                     bool annul) const
 {
     unsigned initSize = snippet.size();
     
@@ -133,25 +189,31 @@
                                std::vector<InstCandidate>& candidates) 
 {
     for(uint64_t currAddr = start; currAddr <= end; currAddr += getInstWidth()) {
-        unsigned inst = m_pVM->readInstrFrmVm(currAddr);
-        
-        cout << "findCandidates processing instruction:\t";
-        printInst(m_pVM->readInstrFrmVm(currAddr));
-        cout << endl;
-        
         InstCandidate cand;
         if(isCandidateLoad(currAddr, end, cand))
-            cerr << "It's a candidate load!" << endl;
+            candidates.push_back(cand);
     }
 }
 
 static inline bool isLoadHalfWord(unsigned inst)
 {
-    // Returns true if inst is an lduh instruction
+    // Returns true if inst is an LDUH instruction
     return RD_FLD(inst, INSTR_OP) == OP_3 &&
         RD_FLD(inst, INSTR_OP3) == OP3_LDUH;
 }
 
+static inline bool isLoadByte(unsigned inst) 
+{
+    // Returns true if inst is an LDUB instruction
+    return RD_FLD(inst, INSTR_OP) == OP_3 &&
+        RD_FLD(inst, INSTR_OP3) == OP3_LDUB;
+}
+
+static inline bool isFPRelative(unsigned inst) 
+{
+    return RD_FLD(inst, INSTR_RS1) == R_BP && RD_FLD(inst, INSTR_I) == 1;
+}
+
 static inline bool isSTH(unsigned inst) 
 {
     return RD_FLD(inst, INSTR_OP) == OP_3 &&
@@ -168,7 +230,7 @@
 {
     // Assumes that inst is a load instruction, and returns the register ID of its
     // destination operand.
-    
+
     return RD_FLD(inst, INSTR_RD);
 }
 
@@ -176,16 +238,80 @@
 {
     // Assumes that inst is a stb/sth instruction, and returns the register ID of its
     // source operand (by source, we don't mean rs1 or rs2, but rather rd, which specifies
-    // the register which contains the value being stored)
+    // the register which contains the value being stored).
 
     return RD_FLD(inst, INSTR_RD);
 }
 
+static inline unsigned getFPOffset(unsigned inst) 
+{
+    assert(isFPRelative(inst) && "Expect instruction to be FP-relative");
+    return RD_FLD(inst, INSTR_SIMM13);
+}
+
+bool InstManip::determineSchema(InstCandidate& cand,
+                                uint64_t end,
+                                std::pair<uint64_t, unsigned>& load,
+                                std::pair<uint64_t, unsigned>& store)
+{
+    // 'load' holds the load instruction (the actual candidate); 'store' holds the
+    // corresponding store instruction, which is either STB or STH.  If STB, take
+    // actions for schema 1; if STH, schema 2.
+    
+    if(isSTB(store.second)) {
+        // Schema 1: "direct" pattern
+        cand.setType(InstCandidate::DIRECT);
+        cand.push_back(load);
+        cand.push_back(store);
+        return true;
+    }
+    else {
+        assert(isSTH(store.second) && "Instruction must be STH");
+
+        // We have potentially discovered an instance of schema 2, but must search
+        // more to determine if this is the case.
+        // 
+        // KIS heuristic concession: the STH given by 'store' *must* be storing to the stack
+        // in an fp-relative manner; if not, we deny the originating load's candidacy.
+                
+        if(isFPRelative(store.second)) {
+            // Search forward until an LDUB from the same stack location (+1) as the STH
+            // wrote to is encountered.  The +1 in the FP offset we're searching for is
+            // due to the fact that we stored a half-word but are loading a byte.
+
+            if(uint64_t stkLoadAddr = findNextStackLoad(store.first, end, getFPOffset(store.second) + 1)) {
+                // Last schema-2 search: find the STB instruction that stores from the
+                // LDUB's destination register.
+                        
+                unsigned ldubInst = m_pVM->readInstrFrmVm(stkLoadAddr);
+                uint64_t stbAddr = findNextStore(stkLoadAddr, end, getLoadDest(ldubInst));
+                unsigned stbInst;
+
+                if(stbAddr && isSTB((stbInst = m_pVM->readInstrFrmVm(stbAddr)))) {
+                            
+                    // All of the criteria have been met for Schema 2, the "stack transfer"
+                    // pattern.
+                        
+                    cand.setType(InstCandidate::STACK_XFER);
+                    cand.push_back(load);
+                    cand.push_back(store);
+                    cand.push_back(stkLoadAddr, ldubInst);
+                    cand.push_back(stbAddr, stbInst);
+                    return true;
+                }
+            }
+        }
+    }
+
+    return false;
+}
+
 bool InstManip::isCandidateLoad(uint64_t addr,
                                 uint64_t end,
                                 InstCandidate& cand) 
 {
     // {{{ Description of heuristic
+
     // A candidate load is the first instruction in a sequence (with an arbitrary number
     // of instructions in between elements of this sequence) that is a "signature" for the
     // particular load of a volatile variable which needs to be replaced with a call to an
@@ -214,6 +340,7 @@
     // The current heuristic catches both of these patterns (designated "direct" and "stack
     // transfer" respectively), and will be extended as insufficiencies in the heuristic
     // are revealed.
+
     // }}}
     
     // Address of potential candidate load is given by 'addr', maximum search address is
@@ -223,34 +350,44 @@
     
     if(isLoadHalfWord(inst)) {
         // Search forward until a sth/stb from inst's target register is encountered
-        uint64_t storeAddr = findNextStore(addr, end, getLoadDest(inst));
-        if(!storeAddr)
-            return false; // No store? Can't be a candidate load.
-        
-        // If STB, ... If STH, ...
+        if(uint64_t storeAddr = findNextStore(addr, end, getLoadDest(inst))) {
 
-        unsigned storeInst = m_pVM->readInstrFrmVm(storeAddr);
-        if(isSTH(storeInst)) {
-            cerr << "Discovered sth: " << endl;
-        }
-        else {
-            // STB instruction
-            cerr << "Discovered stb: " << endl;
+            // If STB, take actions for schema 1, otherwise check for schema 2 conditions.
+
+            unsigned storeInst = m_pVM->readInstrFrmVm(storeAddr);
+            std::pair<uint64_t, unsigned> inst1(addr, inst);
+            std::pair<uint64_t, unsigned> inst2(storeAddr, storeInst);
+
+            return determineSchema(cand, end, inst1, inst2);
         }
-        
-        printInst(storeInst);
-        
-        return true;
     }
     
     return false;
 }
 
+uint64_t InstManip::findNextStackLoad(uint64_t addr,
+                                      uint64_t end,
+                                      unsigned fpOffset)
+{
+    // Sweep the range of addresses starting at addr, up to end, looking for a load byte
+    // that is loading from [%fp + fpOffset]. Return the first such instance, or 0 if such
+    // an instance cannot be found.
+
+    for(uint64_t currAddr = addr; currAddr <= end; currAddr += getInstWidth()) {
+        unsigned inst = m_pVM->readInstrFrmVm(currAddr);
+
+        if(isLoadByte(inst) && isFPRelative(inst) && getFPOffset(inst) == fpOffset)
+            return currAddr;
+    }
+    
+    return 0;
+}
+
 uint64_t InstManip::findNextStore(uint64_t addr,
                                   uint64_t end,
                                   unsigned srcReg) 
 {
-    // Sweep the range of addresses starting at addr (up to end) looking for stb or sth
+    // Sweep the range of addresses starting at addr, up to end, looking for stb or sth
     // instructions that are storing _from_ 'srcReg'.  Return the first such instance, or
     // 0 if such an instance cannot be found.
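
An aside on the predicates above (isLoadHalfWord, isLoadByte, isSTB, isSTH,
isFPRelative): they all decode fixed bit fields of the 32-bit SPARC instruction
word via RD_FLD.  A minimal self-contained sketch of that idiom, using the
standard SPARC V9 format-3 field positions (the real field/opcode constants
live in the reoptimizer headers, so treat the names below as illustrative):

    // Stand-in for RD_FLD: extract bits [hi..lo] of an instruction word.
    static inline unsigned field(unsigned inst, unsigned hi, unsigned lo)
    {
        return (inst >> lo) & ((1u << (hi - lo + 1)) - 1);
    }

    // Format-3 memory ops: op = bits [31:30] (3 for loads/stores), rd = [29:25],
    // op3 = [24:19], rs1 = [18:14], i = [13], simm13 = [12:0].
    static inline bool looksLikeLDUH(unsigned inst)
    {
        return field(inst, 31, 30) == 3 && field(inst, 24, 19) == 0x02 /* LDUH */;
    }

    static inline bool looksLikeFPRelative(unsigned inst)
    {
        // %fp is %i6 (r30); the i bit set means an immediate (simm13) offset.
        return field(inst, 18, 14) == 30 && field(inst, 13, 13) == 1;
    }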
 


Index: llvm/lib/Reoptimizer/Inst/InstManip.h
diff -u llvm/lib/Reoptimizer/Inst/InstManip.h:1.6 llvm/lib/Reoptimizer/Inst/InstManip.h:1.7
--- llvm/lib/Reoptimizer/Inst/InstManip.h:1.6	Fri Apr 11 18:57:07 2003
+++ llvm/lib/Reoptimizer/Inst/InstManip.h	Tue Apr 15 16:26:19 2003
@@ -38,7 +38,7 @@
     bool isDirect() const       { return m_type == DIRECT;     }
     bool isStackXfer() const    { return m_type == STACK_XFER; }
 
-    const std::vector<std::pair<uint64_t, unsigned> >& getInsts() const 
+    std::vector<std::pair<uint64_t, unsigned> >& getInsts()
     {
         return m_insts;
     }
@@ -48,6 +48,18 @@
         m_insts.push_back(std::make_pair(addr, inst));
     }
 
+    void push_back(std::pair<uint64_t, unsigned>& inst) 
+    {
+        m_insts.push_back(inst);
+    }
+
+    const std::pair<uint64_t, unsigned>& front() const
+    {
+        return m_insts.front();
+    }
+
+    void print(std::ostream& ostr) const;
+
   protected:
     CandType m_type;
 
@@ -55,6 +67,8 @@
     std::vector<std::pair<uint64_t, unsigned> > m_insts;
 };
 
+std::ostream& operator<<(std::ostream& ostr, const InstCandidate& cand);
+
 class InstManip 
 {
   public:
@@ -75,18 +89,25 @@
                     
     uint64_t        skipFunctionHdr(uint64_t addr) const;
                     
+    void            generateAddressCopy(unsigned loadInst,
+                                        std::vector<unsigned>& snippet,
+                                        TargetRegister reg = REG_0) const;
+
     void            generateLoad(uint64_t value,
                                  std::vector<unsigned>& snippet,
                                  TargetRegister reg = REG_0) const;
 
     void            generateCall(uint64_t dest,
                                  uint64_t slotBase,
-                                 std::vector<unsigned>& snippet);
+                                 std::vector<unsigned>& snippet) const;
 
+    void            generateRestore(std::vector<unsigned>& snippet) const;
+    void            generateSave(std::vector<unsigned>& snippet) const;
+    
     void            generateBranchAlways(uint64_t dest,
                                          uint64_t slotBase,
                                          std::vector<unsigned>& snippet,
-                                         bool annul = true);
+                                         bool annul = true) const;
 
     void            findCandidates(uint64_t start,
                                    uint64_t end,
@@ -99,11 +120,15 @@
     // These are functions so when InstManip is superclassed, they'd become virtual, etc.
     // In the short term we could use class constants, but this is more clear.
     
-    unsigned        getNOP() const                 { return 0x01000000; }
-    unsigned        getGenLoadSize() const         { return 6;          }
-    unsigned        getGenCallSize() const         { return 2;          }
-    unsigned        getGenBranchAlwaysSize() const { return 2;          }
-    unsigned        getInstWidth() const           { return 4;          }
+    unsigned        getNOP() const                 { return NOP_INST; }
+    unsigned        getGenLoadSize() const         { return 6;        }
+    unsigned        getGenCallSize() const         { return 2;        }
+    unsigned        getGenBranchAlwaysSize() const { return 2;        }
+    unsigned        getGenSaveSize() const         { return 1;        }
+    unsigned        getGenRestoreSize() const      { return 1;        }
+    unsigned        getInstWidth() const           { return 4;        }
+
+    inline unsigned getAddressCopySize(unsigned loadInst) const;
 
   private:
     InstManip() {}
@@ -112,9 +137,18 @@
                                     uint64_t end,
                                     InstCandidate& cand);
 
+    bool            determineSchema(InstCandidate& cand,
+                                    uint64_t end,
+                                    std::pair<uint64_t, unsigned>& load,
+                                    std::pair<uint64_t, unsigned>& store);
+
     uint64_t        findNextStore(uint64_t addr,
                                   uint64_t end,
                                   unsigned srcReg);
+
+    uint64_t        findNextStackLoad(uint64_t addr,
+                                      uint64_t end,
+                                      unsigned fpOffset);
     
     // Branch-always (annul bit high) instruction base (i.e. address not filled in yet)
     static const unsigned BRANCH_ALWAYS_BASE = 0x30480000;
@@ -163,5 +197,17 @@
 {
     return ::isBranchInstr(inst);
 }
+
+unsigned InstManip::getAddressCopySize(unsigned loadInst) const
+{
+    // Determine the number of instructions required to load the address value used by the
+    // load instruction into some register.
+
+    // Case 1: load is immediate-valued --> add-immediate instruction needed, size is 1 inst
+    // Case 2: load is register-valued --> add-registers instruction needed, size is 1 inst
+
+    return 1;
+}
+
 
 #endif // _INCLUDED_INSTMANIP_H
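
The generate* routines and the getGen*Size() accessors are meant to be used in
lock-step.  A condensed sketch of the phase 3 usage (mirroring
Phase3::processCandidates in Phases.cpp below; 'manip', 'cand', 'p4info',
'slotBase', and 'loadAddr' are placeholders):

    std::vector<unsigned> snippet;
    manip.generateSave(snippet);                                       // 1 inst
    manip.generateAddressCopy(cand.front().second, snippet);          // 1 inst
    manip.generateLoad((uint64_t) p4info, snippet, InstManip::REG_1); // 6 insts
    manip.generateCall((uint64_t) &phase4, slotBase, snippet);        // 2 insts
    manip.generateRestore(snippet);                                   // 1 inst
    manip.generateBranchAlways(loadAddr, slotBase, snippet);          // 2 insts

    // The snippet must exactly fill the slot obtained from the memory manager:
    assert(snippet.size() == manip.getGenSaveSize()
                             + manip.getAddressCopySize(cand.front().second)
                             + manip.getGenLoadSize()
                             + manip.getGenCallSize()
                             + manip.getGenRestoreSize()
                             + manip.getGenBranchAlwaysSize());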


Index: llvm/lib/Reoptimizer/Inst/Phases.cpp
diff -u llvm/lib/Reoptimizer/Inst/Phases.cpp:1.10 llvm/lib/Reoptimizer/Inst/Phases.cpp:1.11
--- llvm/lib/Reoptimizer/Inst/Phases.cpp:1.10	Fri Apr 11 18:57:07 2003
+++ llvm/lib/Reoptimizer/Inst/Phases.cpp	Tue Apr 15 16:26:19 2003
@@ -12,15 +12,21 @@
 //       
 //       2. For each function F (only in text segment preferably), write code to call phase 3.
 //
-//           2a. Replace the first instruction in F with a branch to a new slot in the
-//           dummy function.
+//           2a. Replace the first (replaceable) instruction in F with a branch to a new
+//           slot (annulling bit should specify *not* to execute the branch delay slot) in
+//           the dummy function.
+//
+// 	     2b. In the new slot, write the contents of the phase 2 slot:
+//                        +------------------------------+
+//                        | load parameter for phase 3   |
+//                        |       call to phase 3        |
+//                        |            nop               |
+//                        |    branch back to orig code  |
+//                        |            nop               |
+//                        +------------------------------+
+//               where the parameter to phase 3 is a pointer to the heap-allocated Phase3Info
+//               instance.
 //
-// 	     2b. At the new slot write the call to phase 3, passing it a pointer to an
-// 	     info structure which contains the original (replaced) instruction, the
-// 	     address range of the function, etc.
-//
-//           2c. At the end of the new slot write the direct branch back to the original
-//           code.
 //
 // PHASE 3:
 //
@@ -32,8 +38,19 @@
 //       2. Analyze the function and determine the load-volatile candidates.
 //
 //       3. For each load-volatile candidate,
-//
-//           3a.
+//         3a. Obtain a new slot in the dummy function.
+//         3b. Replace the load candidate with branch to slot.
+//         3c. In the new slot, write the contents of the phase 3 slot:
+//                  +---------------------------------------+
+//                  |             save registers            |
+//                  | copy load-src addr to param1 register |
+//                  | load p4 struct ptr to param2 register |
+//                  |             call to phase 4           |
+//                  |                  nop                  |
+//                  |             restore registers         |
+//                  |          branch back to orig code     |
+//                  |                  nop                  |
+//                  +---------------------------------------+
 //
 //       4. Deallocate the slot that originated this invocation.
 //
@@ -66,11 +83,13 @@
                unsigned origInst,
                uint64_t replaceAddr,
                uint64_t slotDescriptor,
+               unsigned slotSize,
                TraceCache* pTraceCache):
         m_addrRange(addressRange),
         m_origInst(origInst),
         m_replaceAddr(replaceAddr),
         m_slotDescriptor(slotDescriptor),
+        m_slotSize(slotSize),
         m_pTraceCache(pTraceCache)
     {
     }
@@ -89,7 +108,8 @@
     uint64_t    getEndAddr() const     { return m_addrRange.second; }
     uint64_t    getOrigInst() const    { return m_origInst;         }
     uint64_t    getReplaceAddr() const { return m_replaceAddr;      }  
-    uint64_t    getSlot() const        { return m_slotDescriptor;   }   
+    uint64_t    getSlot() const        { return m_slotDescriptor;   }
+    uint64_t    getSlotSize() const    { return m_slotSize;         }
     TraceCache* getTraceCache()        { return m_pTraceCache;      }   
 
   private:
@@ -99,10 +119,40 @@
     unsigned     m_origInst;       // Instruction replaced by phase 2
     uint64_t     m_replaceAddr;    // Address at which to restore original inst
     uint64_t     m_slotDescriptor; // Slot created by phase 2
+    unsigned     m_slotSize;       // Size of slot created by phase 2
     TraceCache*  m_pTraceCache;    // TraceCache instance used by phase 2
 };
 
+class Phase4Info
+{
+  public:
+    Phase4Info(const InstCandidate& candidate,
+               uint64_t slotDescriptor,
+               uint64_t slotSize,
+               TraceCache* pTraceCache):
+        m_candidate(candidate),
+        m_slotDescriptor(slotDescriptor),
+        m_slotSize(slotSize),
+        m_pTraceCache(pTraceCache)
+    {
+    }
+
+    const InstCandidate& getCandidate() const { return m_candidate;        }
+    uint64_t             getSlot() const      { return m_slotDescriptor;   }
+    uint64_t             getSlotSize() const  { return m_slotSize;         }
+    TraceCache*          getTraceCache()      { return m_pTraceCache;      }   
+
+  private:
+    Phase4Info() {}
+
+    InstCandidate m_candidate;      // Candidate responsible for this instance's creation
+    uint64_t      m_slotDescriptor; // Slot created by phase 3
+    unsigned      m_slotSize;       // Size of slot created by phase 3
+    TraceCache*   m_pTraceCache;    // TraceCache instance used by phases 2 and 3
+};
+
 void phase3(Phase3Info* p3info);
+void phase4(uint64_t tag, Phase4Info* p4info);
 
 // Phase2 is the class that is responsible for effecting the core of the phase 2
 // transformation; the global function phase2() is simply a C-linkage interface.
@@ -130,16 +180,19 @@
 {
   public:
     Phase3(Phase3Info* p3info);
+    ~Phase3();
+
     void transform();
 
   private:
     Phase3(): m_instManip(0) {}
 
-    uint64_t    m_startAddr;
-    uint64_t    m_endAddr;
+    void            processCandidates(vector<InstCandidate>& candidates);
+    inline unsigned getSlotSize(InstCandidate&) const;
+
+    Phase3Info* m_pPhase3Info;
     TraceCache* m_pTraceCache;
     InstManip   m_instManip;
-    uint64_t    m_slotDescriptor;
 };
 
 
@@ -197,21 +250,6 @@
 
 void Phase2::transformFunction(AddressRange& range)
 {
-    ////////////////
-    // 1. Replace the first (replacable) instruction in F with a branch to a new slot
-    // (annulling bit should specify *not* to execute the branch delay slot) in the dummy
-    // function.
-    //
-    // 2. In the slot, write:
-    //
-    //   - The code to load the pointer to the heap-allocated Phase3Info instance.
-    // 
-    //   - The call to phase 3
-    //
-    //   - The branch back to the location of the replaced instruction (phase 3 will
-    //   replace the instruction at runtime).
-    //
-
     // Obtain address of first replaceable instruction in function and obtain a new slot from
     // the TraceCache memory manager (i.e., a new slot in the dummy function).
     
@@ -230,7 +268,8 @@
     // register, which will be used as a parameter to the phase3 call, b) the call to
     // phase 3 itself, and c) the direct branch back to the original code.
 
-    Phase3Info* p3info = new Phase3Info(range, origInst, repInstAddr, slotBase, m_pTraceCache);
+    Phase3Info* p3info = new Phase3Info(range, origInst, repInstAddr,
+                                        slotBase, getSlotSize(), m_pTraceCache);
 
     vector<unsigned> snippet;
     m_instManip.generateLoad((uint64_t) p3info, snippet);
@@ -244,14 +283,8 @@
 
 unsigned Phase2::getSlotSize() const
 {
-    // A slot used by phase 2 looks like:
-    // +------------------------------+
-    // | load parameter for phase 3   |
-    // |       call to phase 3        |
-    // |            nop               |
-    // |    branch back to orig code  |
-    // |            nop               |
-    // +------------------------------+
+    // The following sum corresponds to the sizes consumed by the various regions of the
+    // phase 2 slot.  See picture of phase 2 contents for details.
 
     return m_instManip.getGenLoadSize() +
         m_instManip.getGenCallSize() +
@@ -263,10 +296,27 @@
 void phase3(Phase3Info* p3info)
 {
     Phase3 p3(p3info);
-    p3info = 0;
-
     p3.transform();
+}
+
+Phase3::Phase3(Phase3Info* p3info):
+    m_pPhase3Info(p3info),
+    m_pTraceCache(p3info->getTraceCache()),
+    m_instManip(p3info->getTraceCache()->getVM())
+{
+    cerr << "================ Begin Phase 3 [" << std::hex
+         << m_pPhase3Info->getStartAddr() << ", " << m_pPhase3Info->getEndAddr()
+         << "] ================\n";
+
+    // 1. Replace the original (replaced) instruction at the proper location in the
+    // original code (thus effectively removing the branch to the slot created by phase 2
+    // as well).
+
+    m_pTraceCache->getVM()->writeInstToVM(p3info->getReplaceAddr(), p3info->getOrigInst());
+}
 
+Phase3::~Phase3() 
+{
     // Deallocate the originating slot (i.e. the slot that invoked us).
     // 
     // NB: Yes, we are, in fact, deallocating a memory segment (i.e., the slot obtained by
@@ -275,36 +325,107 @@
     // write to it.  However, it does indeed pose a problem for multi-threaded codes.  A
     // modification to the general mechanism itself is required to achieve thread-safety.
 
-    // (TODO)
+    cerr << "About to deallocate phase2-created slot" << endl;
+
+    uint64_t slotBase = m_pPhase3Info->getSlot();
+    unsigned slotSize = m_pPhase3Info->getSlotSize();
+    m_pTraceCache->getMemMgr()->freeTraceMemory(slotBase, slotSize);
+
+    // Deallocate the parameter structure
+    delete m_pPhase3Info;
 }
 
-Phase3::Phase3(Phase3Info* p3info):
-    m_instManip(p3info->getTraceCache()->getVM())
+void Phase3::processCandidates(vector<InstCandidate>& candidates) 
 {
-    assert(p3info && "phase3 requires valid Phase3Info ptr");
+    // For each load candidate, obtain a new slot and write the phase 3 slot region
+    // contents into it.  See diagram in comments at top of file for more info.
 
-    m_startAddr = p3info->getStartAddr();
-    m_endAddr = p3info->getEndAddr();
-    m_pTraceCache = p3info->getTraceCache();
-    m_slotDescriptor = p3info->getSlot();
-    
-    cerr << "================ Begin Phase 3 [" << std::hex
-         << m_startAddr << ", " << m_endAddr
-         << "] ================\n";
+    for(vector<InstCandidate>::iterator i = candidates.begin(), e = candidates.end(); i != e; ++i) {
+        cerr << "Transforming " << *i << endl;
 
-    // Restore the replaced instruction to its original location (thus effectively
-    // removing the branch to the slot created by phase 2 as well)
-    m_pTraceCache->getVM()->writeInstToVM(p3info->getReplaceAddr(), p3info->getOrigInst());
+        uint64_t slotBase = m_pTraceCache->getMemMgr()->getMemory(getSlotSize(*i));
+        assert(slotBase && "Unable to obtain memory from MemoryManager instance");
+
+        // Replace load candidate instruction with a branch to start of slot.
+        VirtualMem* vm = m_pTraceCache->getVM();
+        uint64_t loadAddr = i->front().first;
+        vm->writeInstToVM(loadAddr, m_instManip.getBranchAlways(slotBase, loadAddr));
+
+        // Generate a) code to save the registers, b) instruction(s) to store the load
+        // source address into a phase4 parameter register, c) the load of (the
+        // pointer-to) the heap-allocated Phase4Info structure into a phase4 parameter
+        // register, and d) code to call phase 3, restore regs, and branch back to
+        // original code.
+
+        Phase4Info* p4info = new Phase4Info(*i, slotBase, getSlotSize(*i), m_pTraceCache);
+
+        vector<unsigned> snippet;
+        m_instManip.generateSave(snippet);
+        m_instManip.generateAddressCopy(i->front().second, snippet);
+        m_instManip.generateLoad((uint64_t) p4info, snippet, InstManip::REG_1);
+        m_instManip.generateCall((uint64_t) &phase4, slotBase, snippet);
+        m_instManip.generateRestore(snippet);
+        m_instManip.generateBranchAlways(i->front().first, slotBase, snippet);
+
+        // Dump snippet instructions:
+
+        cerr << "phase4 slot instructions:" << endl;
+        
+        for(vector<unsigned>::iterator j = snippet.begin(), k = snippet.end(); j != k; ++j) {
+            m_instManip.printInst(*j);
+            cerr << endl;
+        }
 
-    // Deallocate the parameter structure
-    delete p3info;
+        // Copy the snippet code into the slot
+        assert(snippet.size() == getSlotSize(*i) && "Snippet size does not match slot size");
+        copySnippetToSlot(snippet, slotBase, vm, &m_instManip);
+
+        // just one candidate for now
+        break;
+    }
+}
+
+unsigned Phase3::getSlotSize(InstCandidate& cand) const
+{
+    // The following sum corresponds to the sizes consumed by the various regions of the
+    // phase 3 slot.  See picture of phase 3 contents for details.
+
+    return m_instManip.getGenSaveSize() +
+        m_instManip.getAddressCopySize(cand.front().second) +
+        m_instManip.getGenLoadSize() +
+        m_instManip.getGenCallSize() +
+        m_instManip.getGenRestoreSize() +
+        m_instManip.getGenBranchAlwaysSize();
 }
 
 void Phase3::transform()
 {
-    // Gather up the instruction candidates within the function we to transform.
+    // 2. Analyze the function and determine the load-volatile candidates...
     vector<InstCandidate> candidates;
-    m_instManip.findCandidates(m_startAddr, m_endAddr, candidates);
+    m_instManip.findCandidates(m_pPhase3Info->getStartAddr(),
+                               m_pPhase3Info->getEndAddr(),
+                               candidates);
 
+    // ...and process them
+    processCandidates(candidates);
     cerr << "============================== End Phase 3 ==============================\n";
+}
+
+//////////////// Phase4 implementation ////////////////
+
+void phase4(uint64_t tag, Phase4Info* p4info) 
+{
+    cerr << "phase4 invoked!" << endl;
+
+    cerr << "tag is " << std::hex << tag << endl;
+
+    cerr << "inst candidate inside info structure is: " << endl;
+    cerr << p4info->getCandidate() << endl;
+
+    // (TEMP) For now, restore the candidate load to its original position for debugging
+    // purposes.
+
+    p4info->getTraceCache()->getVM()->writeInstToVM(p4info->getCandidate().front().first,
+                                                    p4info->getCandidate().front().second);
+    delete p4info;
 }
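
Note how the slot realizes the "register contents passed as a parameter" part
of the log message: generateAddressCopy() evidently materializes the candidate
load's effective address into %o0 and generateLoad() puts the Phase4Info
pointer into %o1, so under the SPARC calling convention they arrive in phase4()
as 'tag' and 'p4info' respectively.  As a back-of-the-envelope check (not part
of the patch), the phase 3 slot size implied by the getGen*Size() accessors:

    // Region sizes (instruction counts) as declared in InstManip.h:
    unsigned save = 1, addrCopy = 1, load = 6, call = 2, restore = 1, branch = 2;
    unsigned slotInsts = save + addrCopy + load + call + restore + branch;
    assert(slotInsts == 13);       // 13 instructions per phase 3 slot,
    assert(slotInsts * 4 == 52);   // i.e. 52 bytes at the 4-byte inst width.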




