[llvm-commits] CVS: llvm/lib/Target/X86/InstSelectSimple.cpp

Mon Jan 6 12:49:01 PST 2003

Changes in directory llvm/lib/Target/X86:

InstSelectSimple.cpp updated: 1.80 -> 1.81

---
Log message:

* Adjust to use new interfaces, eliminating CurReg stuff
* Support arbitrary FP constants
* Fix bugs in frame layout for function calls and incoming arguments
* Insert copies for constant arguments to PHI nodes into the BOTTOM of
  predecessor blocks, not the top.
* Implement _floating point_ support: setcc, return, load, store, cast
* Fix several bugs in the cast instruction



---
Diffs of the changes:

Index: llvm/lib/Target/X86/InstSelectSimple.cpp
diff -u llvm/lib/Target/X86/InstSelectSimple.cpp:1.80 llvm/lib/Target/X86/InstSelectSimple.cpp:1.81

--- llvm/lib/Target/X86/InstSelectSimple.cpp:1.80	Sat Dec 28 15:08:27 2002
+++ llvm/lib/Target/X86/InstSelectSimple.cpp	Mon Jan  6 12:47:54 2003
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/SSARegMap.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Support/InstVisitor.h"
 #include "llvm/Target/MRegisterInfo.h"
@@ -59,14 +60,12 @@
     MachineFunction *F;                    // The function we are compiling into
     MachineBasicBlock *BB;                 // The current MBB we are compiling
 
-    unsigned CurReg;
     std::map<Value*, unsigned> RegMap;  // Mapping between Val's and SSA Regs
 
     // MBBMap - Mapping between LLVM BB -> Machine BB
     std::map<const BasicBlock*, MachineBasicBlock*> MBBMap;
 
-    ISel(TargetMachine &tm)
-      : TM(tm), F(0), BB(0), CurReg(MRegisterInfo::FirstVirtualRegister) {}
+    ISel(TargetMachine &tm) : TM(tm), F(0), BB(0) {}
 
     /// runOnFunction - Top level implementation of instruction selection for
     /// the entire function.
@@ -89,7 +88,6 @@
 
       RegMap.clear();
       MBBMap.clear();
-      CurReg = MRegisterInfo::FirstVirtualRegister;
       F = 0;
       return false;  // We never modify the LLVM itself.
     }
@@ -155,7 +153,9 @@
     void visitSetGE(SetCondInst &I) { visitSetCCInst(I, 5); }
 
     // Memory Instructions
+    MachineInstr *doFPLoad(const Type *Ty, unsigned DestReg);
     void visitLoadInst(LoadInst &I);
+    void doFPStore(const Type *Ty, unsigned DestAddrReg, unsigned SrcReg);
     void visitStoreInst(StoreInst &I);
     void visitGetElementPtrInst(GetElementPtrInst &I);
     void visitAllocaInst(AllocaInst &I);
@@ -198,8 +198,7 @@
       // Add the mapping of regnumber => reg class to MachineFunction
       const TargetRegisterClass *RC =
 	TM.getRegisterInfo()->getRegClassForType(Ty);
-      F->getSSARegMap()->addRegMap(CurReg, RC);
-      return CurReg++;
+      return F->getSSARegMap()->createVirtualRegister(RC);
     }
 
     /// getReg - This method turns an LLVM value into a register number.  This
@@ -316,8 +315,10 @@
     else if (Value == +1.0)
       BMI(MBB, IP, X86::FLD1, 0, R);
     else {
-      std::cerr << "Cannot load constant '" << Value << "'!\n";
-      assert(0);
+      // Otherwise we need to spill the constant to memory...
+      MachineConstantPool *CP = F->getConstantPool();
+      unsigned CPI = CP->getConstantPoolIndex(CFP);
+      addConstantPoolReference(doFPLoad(CFP->getType(), R), CPI);
     }
 
   } else if (isa<ConstantPointerNull>(C)) {
@@ -344,15 +345,13 @@
   // [ESP + 8] -- second argument, if four bytes in size
   //    ... 
   //
-  unsigned ArgOffset = 0;
+  unsigned ArgOffset = 4;
   MachineFrameInfo *MFI = F->getFrameInfo();
 
   for (Function::aiterator I = Fn.abegin(), E = Fn.aend(); I != E; ++I) {
     unsigned Reg = getReg(*I);
     
-    ArgOffset += 4;  // Each argument takes at least 4 bytes on the stack...
     int FI;          // Frame object index
-
     switch (getClassB(I->getType())) {
     case cByte:
       FI = MFI->CreateFixedObject(1, ArgOffset);
@@ -373,14 +372,15 @@
 	FI = MFI->CreateFixedObject(4, ArgOffset);
       } else {
 	Opcode = X86::FLDr64;
-	ArgOffset += 4;   // doubles require 4 additional bytes
 	FI = MFI->CreateFixedObject(8, ArgOffset);
+	ArgOffset += 4;   // doubles require 4 additional bytes
       }
       addFrameReference(BuildMI(BB, Opcode, 4, Reg), FI);
       break;
     default:
       assert(0 && "Unhandled argument type!");
     }
+    ArgOffset += 4;  // Each argument takes at least 4 bytes on the stack...
   }
 }
 
@@ -390,6 +390,7 @@
 /// the current one.
 ///
 void ISel::SelectPHINodes() {
+  const MachineInstrInfo &MII = TM.getInstrInfo();
   const Function &LF = *F->getFunction();  // The LLVM function...
   for (Function::const_iterator I = LF.begin(), E = LF.end(); I != E; ++I) {
     const BasicBlock *BB = I;
@@ -410,11 +411,10 @@
         // available in a virtual register, insert the computation code into
         // PredMBB
         //
-	// FIXME: This should insert the code into the BOTTOM of the block, not
-	// the top of the block.  This just makes for huge live ranges...
-        MachineBasicBlock::iterator PI = PredMBB->begin();
-        while ((*PI)->getOpcode() == X86::PHI) ++PI;
-        
+	MachineBasicBlock::iterator PI = PredMBB->end();
+	while (PI != PredMBB->begin() &&
+	       MII.isTerminatorInstr((*(PI-1))->getOpcode()))
+	  --PI;
         MI->addRegOperand(getReg(PN->getIncomingValue(i), PredMBB, PI));
         MI->addMachineBasicBlockOperand(PredMBB);
       }
@@ -433,6 +433,7 @@
 void ISel::visitSetCCInst(SetCondInst &I, unsigned OpNum) {
   // The arguments are already supposed to be of the same type.
   const Type *CompTy = I.getOperand(0)->getType();
+  bool isSigned = CompTy->isSigned();
   unsigned reg1 = getReg(I.getOperand(0));
   unsigned reg2 = getReg(I.getOperand(1));
 
@@ -442,44 +443,26 @@
     // compare 8-bit with 8-bit, 16-bit with 16-bit, 32-bit with
     // 32-bit.
   case cByte:
-    BuildMI (BB, X86::CMPrr8, 2).addReg (reg1).addReg (reg2);
+    BuildMI(BB, X86::CMPrr8, 2).addReg(reg1).addReg(reg2);
     break;
   case cShort:
-    BuildMI (BB, X86::CMPrr16, 2).addReg (reg1).addReg (reg2);
+    BuildMI(BB, X86::CMPrr16, 2).addReg(reg1).addReg(reg2);
     break;
   case cInt:
-    BuildMI (BB, X86::CMPrr32, 2).addReg (reg1).addReg (reg2);
+    BuildMI(BB, X86::CMPrr32, 2).addReg(reg1).addReg(reg2);
     break;
-
-#if 0
-    // Push the variables on the stack with fldl opcodes.
-    // FIXME: assuming var1, var2 are in memory, if not, spill to
-    // stack first
-  case cFP:  // Floats
-    BuildMI (BB, X86::FLDr32, 1).addReg (reg1);
-    BuildMI (BB, X86::FLDr32, 1).addReg (reg2);
+  case cFP:
+    BuildMI(BB, X86::FpUCOM, 2).addReg(reg1).addReg(reg2);
+    BuildMI(BB, X86::FNSTSWr8, 0);
+    BuildMI(BB, X86::SAHF, 1);
+    isSigned = false;   // Compare with unsigned operators
     break;
-  case cFP (doubles):  // Doubles
-    BuildMI (BB, X86::FLDr64, 1).addReg (reg1);
-    BuildMI (BB, X86::FLDr64, 1).addReg (reg2);
-    break;
-#endif
+
   case cLong:
   default:
     visitInstruction(I);
   }
 
-#if 0
-  if (CompTy->isFloatingPoint()) {
-    // (Non-trapping) compare and pop twice.
-    BuildMI (BB, X86::FUCOMPP, 0);
-    // Move fp status word (concodes) to ax.
-    BuildMI (BB, X86::FNSTSWr8, 1, X86::AX);
-    // Load real concodes from ax.
-    BuildMI (BB, X86::SAHF, 1).addReg(X86::AH);
-  }
-#endif
-
   // Emit setOp instruction (extract concode; clobbers ax),
   // using the following mapping:
   // LLVM  -> X86 signed  X86 unsigned
@@ -496,12 +479,12 @@
     {X86::SETEr, X86::SETNEr, X86::SETLr, X86::SETGr, X86::SETLEr, X86::SETGEr},
   };
 
-  BuildMI(BB, OpcodeTab[CompTy->isSigned()][OpNum], 0, getReg(I));
+  BuildMI(BB, OpcodeTab[isSigned][OpNum], 0, getReg(I));
 }
 
 /// promote32 - Emit instructions to turn a narrow operand into a 32-bit-wide
 /// operand, in the specified target register.
-void ISel::promote32 (unsigned targetReg, Value *v) {
+void ISel::promote32(unsigned targetReg, Value *v) {
   unsigned vReg = getReg(v);
   bool isUnsigned = v->getType()->isUnsigned();
   switch (getClass(v->getType())) {
@@ -539,7 +522,7 @@
 ///   ret long, ulong  : Move value into EAX/EDX and return
 ///   ret float/double : Top of FP stack
 ///
-void ISel::visitReturnInst (ReturnInst &I) {
+void ISel::visitReturnInst(ReturnInst &I) {
   if (I.getNumOperands() == 0) {
     BuildMI(BB, X86::RET, 0); // Just emit a 'ret' instruction
     return;
@@ -553,13 +536,13 @@
     promote32(X86::EAX, RetVal);
     break;
   case cFP:                   // Floats & Doubles: Return in ST(0)
-    BuildMI(BB, X86::FpMOV, 1, X86::ST0).addReg(getReg(RetVal));
+    BuildMI(BB, X86::FpSETRESULT, 1).addReg(getReg(RetVal));
     break;
   case cLong:
     // ret long: use EAX(least significant 32 bits)/EDX (most
     // significant 32)...
   default:
-    visitInstruction (I);
+    visitInstruction(I);
   }
   // Emit a 'ret' instruction
   BuildMI(BB, X86::RET, 0);
@@ -595,11 +578,9 @@
     for (unsigned i = 1, e = CI.getNumOperands(); i != e; ++i)
       switch (getClass(CI.getOperand(i)->getType())) {
       case cByte: case cShort: case cInt:
-	NumBytes += 4;
-	break;
+	NumBytes += 4; break;
       case cLong:
-	NumBytes += 8;
-	break;
+	NumBytes += 8; break;
       case cFP:
 	NumBytes += CI.getOperand(i)->getType() == Type::FloatTy ? 4 : 8;
 	break;
@@ -623,29 +604,32 @@
 		     X86::ESP, ArgOffset).addReg(R);
 	break;
       }
-      case cInt:
+      case cInt: {
+        unsigned ArgReg = getReg(Arg);
 	addRegOffset(BuildMI(BB, X86::MOVrm32, 5),
-		     X86::ESP, ArgOffset).addReg(getReg(Arg));
+                     X86::ESP, ArgOffset).addReg(ArgReg);
 	break;
+      }
 
-      case cFP:
+      case cFP: {
+        unsigned ArgReg = getReg(Arg);
 	if (Arg->getType() == Type::FloatTy) {
 	  addRegOffset(BuildMI(BB, X86::FSTr32, 5),
-		       X86::ESP, ArgOffset).addReg(getReg(Arg));
+		       X86::ESP, ArgOffset).addReg(ArgReg);
 	} else {
 	  assert(Arg->getType() == Type::DoubleTy && "Unknown FP type!");
-	  ArgOffset += 4;
-	  addRegOffset(BuildMI(BB, X86::FSTr32, 5),
-		       X86::ESP, ArgOffset).addReg(getReg(Arg));
+	  addRegOffset(BuildMI(BB, X86::FSTr64, 5),
+		       X86::ESP, ArgOffset).addReg(ArgReg);
+	  ArgOffset += 4;       // 8 byte entry, not 4.
 	}
 	break;
-
+      }
       default:
-	// FIXME: long/ulong/float/double args not handled.
+	// FIXME: long/ulong args not handled.
 	visitInstruction(CI);
 	break;
       }
-      ArgOffset += 4;
+      ArgOffset += 4;   // All arguments are at least 4 bytes
     }
   }
 
@@ -657,7 +641,8 @@
     BuildMI(BB, X86::CALLr32, 1).addReg(Reg);
   }
 
-  BuildMI(BB, X86::ADJCALLSTACKUP, 1).addZImm(NumBytes);
+  if (NumBytes)
+    BuildMI(BB, X86::ADJCALLSTACKUP, 1).addZImm(NumBytes);
 
   // If there is a return value, scavenge the result from the location the call
   // leaves it in...
@@ -679,7 +664,7 @@
       break;
     }
     case cFP:     // Floating-point return values live in %ST(0)
-      BuildMI(BB, X86::FpMOV, 1, getReg(CI)).addReg(X86::ST0);
+      BuildMI(BB, X86::FpGETRESULT, 1, getReg(CI));
       break;
     default:
       std::cerr << "Cannot get return value for call of type '"
@@ -783,7 +768,7 @@
     if (I.getOpcode() == Instruction::Div)
       BuildMI(BB, X86::FpDIV, 2, ResultReg).addReg(Op0Reg).addReg(Op1Reg);
     else
-      BuildMI(BB, X86::FpREM, 2, ResultReg).addReg(Op0Reg).addReg(Op1Reg);
+      visitInstruction(I);
     return;
   default:
   case cLong:
@@ -846,50 +831,94 @@
   if (OperandClass > cInt)
     visitInstruction(I); // Can't handle longs yet!
 
-  if (ConstantUInt *CUI = dyn_cast<ConstantUInt> (I.getOperand (1)))
-    {
-      // The shift amount is constant, guaranteed to be a ubyte. Get its value.
-      assert(CUI->getType() == Type::UByteTy && "Shift amount not a ubyte?");
-      unsigned char shAmt = CUI->getValue();
-
-      static const unsigned ConstantOperand[][4] = {
-        { X86::SHRir8, X86::SHRir16, X86::SHRir32, 0 },  // SHR
-        { X86::SARir8, X86::SARir16, X86::SARir32, 0 },  // SAR
-        { X86::SHLir8, X86::SHLir16, X86::SHLir32, 0 },  // SHL
-        { X86::SHLir8, X86::SHLir16, X86::SHLir32, 0 },  // SAL = SHL
-      };
+  if (ConstantUInt *CUI = dyn_cast<ConstantUInt>(I.getOperand(1))) {
+    // The shift amount is constant, guaranteed to be a ubyte. Get its value.
+    assert(CUI->getType() == Type::UByteTy && "Shift amount not a ubyte?");
+    unsigned char shAmt = CUI->getValue();
+
+    static const unsigned ConstantOperand[][4] = {
+      { X86::SHRir8, X86::SHRir16, X86::SHRir32, 0 },  // SHR
+      { X86::SARir8, X86::SARir16, X86::SARir32, 0 },  // SAR
+      { X86::SHLir8, X86::SHLir16, X86::SHLir32, 0 },  // SHL
+      { X86::SHLir8, X86::SHLir16, X86::SHLir32, 0 },  // SAL = SHL
+    };
 
-      const unsigned *OpTab = // Figure out the operand table to use
-        ConstantOperand[isLeftShift*2+isOperandSigned];
+    const unsigned *OpTab = // Figure out the operand table to use
+      ConstantOperand[isLeftShift*2+isOperandSigned];
 
-      // Emit: <insn> reg, shamt  (shift-by-immediate opcode "ir" form.)
-      BuildMI(BB, OpTab[OperandClass], 2, DestReg).addReg(Op0r).addZImm(shAmt);
-    }
-  else
-    {
-      // The shift amount is non-constant.
-      //
-      // In fact, you can only shift with a variable shift amount if
-      // that amount is already in the CL register, so we have to put it
-      // there first.
-      //
+    // Emit: <insn> reg, shamt  (shift-by-immediate opcode "ir" form.)
+    BuildMI(BB, OpTab[OperandClass], 2, DestReg).addReg(Op0r).addZImm(shAmt);
+  } else {
+    // The shift amount is non-constant.
+    //
+    // In fact, you can only shift with a variable shift amount if
+    // that amount is already in the CL register, so we have to put it
+    // there first.
+    //
 
-      // Emit: move cl, shiftAmount (put the shift amount in CL.)
-      BuildMI(BB, X86::MOVrr8, 1, X86::CL).addReg(getReg(I.getOperand(1)));
+    // Emit: move cl, shiftAmount (put the shift amount in CL.)
+    BuildMI(BB, X86::MOVrr8, 1, X86::CL).addReg(getReg(I.getOperand(1)));
 
-      // This is a shift right (SHR).
-      static const unsigned NonConstantOperand[][4] = {
-        { X86::SHRrr8, X86::SHRrr16, X86::SHRrr32, 0 },  // SHR
-        { X86::SARrr8, X86::SARrr16, X86::SARrr32, 0 },  // SAR
-        { X86::SHLrr8, X86::SHLrr16, X86::SHLrr32, 0 },  // SHL
-        { X86::SHLrr8, X86::SHLrr16, X86::SHLrr32, 0 },  // SAL = SHL
-      };
+    // Opcode table for shifts by a variable amount held in CL.
+    static const unsigned NonConstantOperand[][4] = {
+      { X86::SHRrr8, X86::SHRrr16, X86::SHRrr32, 0 },  // SHR
+      { X86::SARrr8, X86::SARrr16, X86::SARrr32, 0 },  // SAR
+      { X86::SHLrr8, X86::SHLrr16, X86::SHLrr32, 0 },  // SHL
+      { X86::SHLrr8, X86::SHLrr16, X86::SHLrr32, 0 },  // SAL = SHL
+    };
 
-      const unsigned *OpTab = // Figure out the operand table to use
-        NonConstantOperand[isLeftShift*2+isOperandSigned];
+    const unsigned *OpTab = // Figure out the operand table to use
+      NonConstantOperand[isLeftShift*2+isOperandSigned];
 
-      BuildMI(BB, OpTab[OperandClass], 1, DestReg).addReg(Op0r);
-    }
+    BuildMI(BB, OpTab[OperandClass], 1, DestReg).addReg(Op0r);
+  }
+}
+
+
+/// doFPLoad - This method is used to load an FP value from memory using the
+/// current endianness.  NOTE: This method returns a partially constructed load
+/// instruction which needs to have the memory source filled in still.
+///
+MachineInstr *ISel::doFPLoad(const Type *Ty, unsigned DestReg) {
+  assert(Ty == Type::FloatTy || Ty == Type::DoubleTy && "Unknown FP type!");
+  unsigned LoadOpcode = Ty == Type::FloatTy ? X86::FLDr32 : X86::FLDr64;
+
+  if (TM.getTargetData().isLittleEndian()) // fast path...
+    return BuildMI(BB, LoadOpcode, 4, DestReg);
+
+  // If we are big-endian, start by creating an LEA instruction to represent the
+  // address of the memory location to load from...
+  //
+  unsigned SrcAddrReg = makeAnotherReg(Type::UIntTy);
+  MachineInstr *Result = BuildMI(BB, X86::LEAr32, 5, SrcAddrReg);
+
+  // Allocate a temporary stack slot to transform the value into...
+  int FrameIdx = F->getFrameInfo()->CreateStackObject(Ty, TM.getTargetData());
+  unsigned DestAddrReg = makeAnotherReg(Type::UIntTy);
+  addFrameReference(BuildMI(BB, X86::LEAr32, 5, DestAddrReg), FrameIdx);
+
+  // Perform the bswaps 32 bits at a time...
+  unsigned TmpReg1 = makeAnotherReg(Type::UIntTy);
+  unsigned TmpReg2 = makeAnotherReg(Type::UIntTy);
+  addDirectMem(BuildMI(BB, X86::MOVmr32, 4, TmpReg1), SrcAddrReg);
+  BuildMI(BB, X86::BSWAPr32, 1, TmpReg2).addReg(TmpReg1);
+  unsigned Offset = (Ty == Type::DoubleTy) << 2;
+  addRegOffset(BuildMI(BB, X86::MOVrm32, 5),
+	       DestAddrReg, Offset).addReg(TmpReg2);
+  
+  if (Ty == Type::DoubleTy) {   // Swap the other 32 bits of a double value...
+    TmpReg1 = makeAnotherReg(Type::UIntTy);
+    TmpReg2 = makeAnotherReg(Type::UIntTy);
+
+    addRegOffset(BuildMI(BB, X86::MOVmr32, 4, TmpReg1), SrcAddrReg, 4);
+    BuildMI(BB, X86::BSWAPr32, 1, TmpReg2).addReg(TmpReg1);
+    unsigned Offset = (Ty == Type::DoubleTy) << 2;
+    addDirectMem(BuildMI(BB, X86::MOVrm32, 5), DestAddrReg).addReg(TmpReg2);
+  }
+
+  // Now we can reload the final byteswapped result into the final destination.
+  addFrameReference(BuildMI(BB, LoadOpcode, 4, DestReg), FrameIdx);
+  return Result;
 }
 
 
@@ -907,9 +936,7 @@
   switch (Class) {
   default: visitInstruction(I);   // FIXME: Handle longs...
   case cFP: {
-    // FIXME: Handle endian swapping for FP values.
-    unsigned Opcode = I.getType() == Type::FloatTy ? X86::FLDr32 : X86::FLDr64;
-    addDirectMem(BuildMI(BB, Opcode, 4, DestReg), SrcAddrReg);
+    addDirectMem(doFPLoad(I.getType(), DestReg), SrcAddrReg);
     return;
   }
   case cInt:      // Integers of various sizes handled below
@@ -922,17 +949,17 @@
   // in is in the upper part of the eight byte memory image of the pointer.  It
   // also happens to be byte-swapped, but this will be handled later.
   //
-  if (!isLittleEndian && hasLongPointers && isa<PointerType>(I.getType())) {
+  if (!isLittleEndian && hasLongPointers && 
+      (isa<PointerType>(I.getType()) ||
+       I.getType() == Type::LongTy || I.getType() == Type::ULongTy)) {
     unsigned R = makeAnotherReg(Type::UIntTy);
     BuildMI(BB, X86::ADDri32, 2, R).addReg(SrcAddrReg).addZImm(4);
     SrcAddrReg = R;
   }
 
   unsigned IReg = DestReg;
-  if (!isLittleEndian) {  // If big endian we need an intermediate stage
-    IReg = makeAnotherReg(I.getType());
-    std::swap(IReg, DestReg);
-  }
+  if (!isLittleEndian)  // If big endian we need an intermediate stage
+    DestReg = makeAnotherReg(I.getType());
 
   static const unsigned Opcode[] = { X86::MOVmr8, X86::MOVmr16, X86::MOVmr32 };
   addDirectMem(BuildMI(BB, Opcode[Class], 4, DestReg), SrcAddrReg);
@@ -957,7 +984,7 @@
       BuildMI(BB, X86::MOVrr16, 1, X86::AX).addReg(DestReg);
       BuildMI(BB, X86::XCHGrr8, 2).addReg(X86::AL, MOTy::UseAndDef)
                                   .addReg(X86::AH, MOTy::UseAndDef);
-      BuildMI(BB, X86::MOVrr16, 1, DestReg).addReg(X86::AX);
+      BuildMI(BB, X86::MOVrr16, 1, IReg).addReg(X86::AX);
       break;
     default: assert(0 && "Class not handled yet!");
     }
@@ -965,32 +992,70 @@
 }
 
 
+/// doFPStore - This method is used to store an FP value to memory using the
+/// current endianness.
+///
+void ISel::doFPStore(const Type *Ty, unsigned DestAddrReg, unsigned SrcReg) {
+  assert(Ty == Type::FloatTy || Ty == Type::DoubleTy && "Unknown FP type!");
+  unsigned StoreOpcode = Ty == Type::FloatTy ? X86::FSTr32 : X86::FSTr64;
+
+  if (TM.getTargetData().isLittleEndian()) {  // fast path...
+    addDirectMem(BuildMI(BB, StoreOpcode,5), DestAddrReg).addReg(SrcReg);
+    return;
+  }
+
+  // Allocate a temporary stack slot to transform the value into...
+  int FrameIdx = F->getFrameInfo()->CreateStackObject(Ty, TM.getTargetData());
+  unsigned SrcAddrReg = makeAnotherReg(Type::UIntTy);
+  addFrameReference(BuildMI(BB, X86::LEAr32, 5, SrcAddrReg), FrameIdx);
+
+  // Store the value into a temporary stack slot...
+  addDirectMem(BuildMI(BB, StoreOpcode, 5), SrcAddrReg).addReg(SrcReg);
+
+  // Perform the bswaps 32 bits at a time...
+  unsigned TmpReg1 = makeAnotherReg(Type::UIntTy);
+  unsigned TmpReg2 = makeAnotherReg(Type::UIntTy);
+  addDirectMem(BuildMI(BB, X86::MOVmr32, 4, TmpReg1), SrcAddrReg);
+  BuildMI(BB, X86::BSWAPr32, 1, TmpReg2).addReg(TmpReg1);
+  unsigned Offset = (Ty == Type::DoubleTy) << 2;
+  addRegOffset(BuildMI(BB, X86::MOVrm32, 5),
+	       DestAddrReg, Offset).addReg(TmpReg2);
+  
+  if (Ty == Type::DoubleTy) {   // Swap the other 32 bits of a double value...
+    TmpReg1 = makeAnotherReg(Type::UIntTy);
+    TmpReg2 = makeAnotherReg(Type::UIntTy);
+
+    addRegOffset(BuildMI(BB, X86::MOVmr32, 4, TmpReg1), SrcAddrReg, 4);
+    BuildMI(BB, X86::BSWAPr32, 1, TmpReg2).addReg(TmpReg1);
+    unsigned Offset = (Ty == Type::DoubleTy) << 2;
+    addDirectMem(BuildMI(BB, X86::MOVrm32, 5), DestAddrReg).addReg(TmpReg2);
+  }
+}
+
+
 /// visitStoreInst - Implement LLVM store instructions in terms of the x86 'mov'
 /// instruction.
 ///
 void ISel::visitStoreInst(StoreInst &I) {
   bool isLittleEndian  = TM.getTargetData().isLittleEndian();
   bool hasLongPointers = TM.getTargetData().getPointerSize() == 8;
-  unsigned ValReg = getReg(I.getOperand(0));
-  unsigned AddressReg = getReg(I.getOperand(1));
+  unsigned ValReg      = getReg(I.getOperand(0));
+  unsigned AddressReg  = getReg(I.getOperand(1));
 
   unsigned Class = getClass(I.getOperand(0)->getType());
   switch (Class) {
   default: visitInstruction(I);   // FIXME: Handle longs...
-  case cFP: {
-    // FIXME: Handle endian swapping for FP values.
-    unsigned Opcode = I.getOperand(0)->getType() == Type::FloatTy ?
-                            X86::FSTr32 : X86::FSTr64;
-    addDirectMem(BuildMI(BB, Opcode, 1+4), AddressReg).addReg(ValReg);
+  case cFP:
+    doFPStore(I.getOperand(0)->getType(), AddressReg, ValReg);
     return;
-  }
   case cInt:      // Integers of various sizes handled below
   case cShort:
   case cByte: break;
   }
 
   if (!isLittleEndian && hasLongPointers &&
-      isa<PointerType>(I.getOperand(0)->getType())) {
+      (isa<PointerType>(I.getOperand(0)->getType()) ||
+       I.getType() == Type::LongTy || I.getType() == Type::ULongTy)) {
     unsigned R = makeAnotherReg(Type::UIntTy);
     BuildMI(BB, X86::ADDri32, 2, R).addReg(AddressReg).addZImm(4);
     AddressReg = R;
@@ -1026,85 +1091,143 @@
 
 /// visitCastInst - Here we have various kinds of copying with or without
 /// sign extension going on.
-void
-ISel::visitCastInst (CastInst &CI)
-{
-  const Type *targetType = CI.getType ();
-  Value *operand = CI.getOperand (0);
-  unsigned operandReg = getReg (operand);
-  const Type *sourceType = operand->getType ();
-  unsigned destReg = getReg (CI);
-  //
-  // Currently we handle:
-  //
-  // 1) cast * to bool
-  //
-  // 2) cast {sbyte, ubyte} to {sbyte, ubyte}
-  //    cast {short, ushort} to {ushort, short}
-  //    cast {int, uint, ptr} to {int, uint, ptr}
-  //
-  // 3) cast {sbyte, ubyte} to {ushort, short}
-  //    cast {sbyte, ubyte} to {int, uint, ptr}
-  //    cast {short, ushort} to {int, uint, ptr}
-  //
-  // 4) cast {int, uint, ptr} to {short, ushort}
-  //    cast {int, uint, ptr} to {sbyte, ubyte}
-  //    cast {short, ushort} to {sbyte, ubyte}
+void ISel::visitCastInst(CastInst &CI) {
+  const Type *DestTy = CI.getType();
+  Value *Src = CI.getOperand(0);
+  unsigned SrcReg = getReg(Src);
+  const Type *SrcTy = Src->getType();
+  unsigned SrcClass = getClassB(SrcTy);
+  unsigned DestReg = getReg(CI);
+  unsigned DestClass = getClassB(DestTy);
 
   // 1) Implement casts to bool by using compare on the operand followed
   // by set if not zero on the result.
-  if (targetType == Type::BoolTy)
-    {
-      BuildMI (BB, X86::CMPri8, 2).addReg (operandReg).addZImm (0);
-      BuildMI (BB, X86::SETNEr, 1, destReg);
-      return;
-    }
+  if (DestTy == Type::BoolTy) {
+    if (SrcClass == cFP || SrcClass == cLong)
+      visitInstruction(CI);
+    
+    BuildMI(BB, X86::CMPri8, 2).addReg(SrcReg).addZImm(0);
+    BuildMI(BB, X86::SETNEr, 1, DestReg);
+    return;
+  }
 
-  // 2) Implement casts between values of the same type class (as determined
-  // by getClass) by using a register-to-register move.
-  unsigned srcClass = getClassB(sourceType);
-  unsigned targClass = getClass(targetType);
-  static const unsigned regRegMove[] = {
-    X86::MOVrr8, X86::MOVrr16, X86::MOVrr32
+  static const unsigned RegRegMove[] = {
+    X86::MOVrr8, X86::MOVrr16, X86::MOVrr32, X86::FpMOV
   };
 
-  if (srcClass <= cInt && targClass <= cInt && srcClass == targClass) {
-    BuildMI(BB, regRegMove[srcClass], 1, destReg).addReg(operandReg);
+  // Implement casts between values of the same type class (as determined by
+  // getClass) by using a register-to-register move.
+  if (SrcClass == DestClass) {
+    if (SrcClass <= cInt || (SrcClass == cFP && SrcTy == DestTy)) {
+      BuildMI(BB, RegRegMove[SrcClass], 1, DestReg).addReg(SrcReg);
+    } else if (SrcClass == cFP) {
+      if (SrcTy == Type::FloatTy) {  // float -> double
+	assert(DestTy == Type::DoubleTy && "Unknown cFP member!");
+	BuildMI(BB, X86::FpMOV, 1, DestReg).addReg(SrcReg);
+      } else {                       // double -> float
+	assert(SrcTy == Type::DoubleTy && DestTy == Type::FloatTy &&
+	       "Unknown cFP member!");
+	// Truncate from double to float by storing to memory as a 32-bit float,
+	// then reading it back.
+	unsigned FltAlign = TM.getTargetData().getFloatAlignment();
+        int FrameIdx = F->getFrameInfo()->CreateStackObject(4, FltAlign);
+	addFrameReference(BuildMI(BB, X86::FSTr32, 5), FrameIdx).addReg(SrcReg);
+	addFrameReference(BuildMI(BB, X86::FLDr32, 5, DestReg), FrameIdx);
+      }
+    } else {
+      visitInstruction(CI);
+    }
     return;
   }
-  // 3) Handle cast of SMALLER int to LARGER int using a move with sign
-  // extension or zero extension, depending on whether the source type
-  // was signed.
-  if ((srcClass <= cInt) && (targClass <= cInt) && (srcClass < targClass))
-    {
-      static const unsigned ops[] = {
-	X86::MOVSXr16r8, X86::MOVSXr32r8, X86::MOVSXr32r16,
-	X86::MOVZXr16r8, X86::MOVZXr32r8, X86::MOVZXr32r16
-      };
-      unsigned srcSigned = sourceType->isSigned ();
-      BuildMI (BB, ops[3 * srcSigned + srcClass + targClass - 1], 1,
-	       destReg).addReg (operandReg);
-      return;
-    }
-  // 4) Handle cast of LARGER int to SMALLER int using a move to EAX
-  // followed by a move out of AX or AL.
-  if ((srcClass <= cInt) && (targClass <= cInt) && (srcClass > targClass))
-    {
-      static const unsigned AReg[] = { X86::AL, X86::AX, X86::EAX };
-      BuildMI (BB, regRegMove[srcClass], 1,
-	       AReg[srcClass]).addReg (operandReg);
-      BuildMI (BB, regRegMove[targClass], 1, destReg).addReg (AReg[srcClass]);
-      return;
-    }
+
+  // Handle cast of SMALLER int to LARGER int using a move with sign extension
+  // or zero extension, depending on whether the source type was signed.
+  if (SrcClass <= cInt && DestClass <= cInt && SrcClass < DestClass) {
+    static const unsigned Opc[][3] = {
+      { X86::MOVSXr16r8, X86::MOVSXr32r8, X86::MOVSXr32r16 }, //   signed
+      { X86::MOVZXr16r8, X86::MOVZXr32r8, X86::MOVZXr32r16 }  // unsigned
+    };
+    
+    BuildMI(BB, Opc[SrcTy->isUnsigned()][SrcClass + DestClass - 1], 1,
+            DestReg).addReg(SrcReg);
+    return;
+  }
+  
+  // Handle cast of LARGER int to SMALLER int using a move to EAX followed by a
+  // move out of AX or AL.
+  if (SrcClass <= cInt && DestClass <= cInt && SrcClass > DestClass) {
+    static const unsigned AReg[] = { X86::AL, X86::AX, X86::EAX };
+    BuildMI(BB, RegRegMove[SrcClass], 1, AReg[SrcClass]).addReg(SrcReg);
+    BuildMI(BB, RegRegMove[DestClass], 1, DestReg).addReg(AReg[DestClass]);
+    return;
+  }
+
+  // Handle casts from integer to floating point now...
+  if (DestClass == cFP) {
+    // unsigned int -> load as 64 bit int.
+    // unsigned long long -> more complex
+    if (SrcTy->isUnsigned() && SrcTy != Type::UByteTy)
+      visitInstruction(CI);  // don't handle unsigned src yet!
+
+    // We don't have the facilities for directly loading byte sized data from
+    // memory.  Promote it to 16 bits.
+    if (SrcClass == cByte) {
+      unsigned TmpReg = makeAnotherReg(Type::ShortTy);
+      BuildMI(BB, SrcTy->isSigned() ? X86::MOVSXr16r8 : X86::MOVZXr16r8,
+	      1, TmpReg).addReg(SrcReg);
+      SrcTy = Type::ShortTy;     // Pretend the short is our input now!
+      SrcClass = cShort;
+      SrcReg = TmpReg;
+    }
+
+    // Spill the integer to memory and reload it from there...
+    int FrameIdx =
+      F->getFrameInfo()->CreateStackObject(SrcTy, TM.getTargetData());
+
+    if (SrcClass > cInt) visitInstruction(CI);
+    static const unsigned Op1[] = { X86::MOVrm8, X86::MOVrm16, X86::MOVrm32 };
+    addFrameReference(BuildMI(BB, Op1[SrcClass], 5), FrameIdx).addReg(SrcReg);
+
+    static const unsigned Op2[] =
+      { 0, X86::FILDr16, X86::FILDr32, 0, X86::FILDr64 };
+    addFrameReference(BuildMI(BB, Op2[SrcClass], 5, DestReg), FrameIdx);
+    return;
+  }
+
+  // Handle casts from floating point to integer now...
+  if (SrcClass == cFP) {
+    // unsigned long long -> more complex
+    if (SrcClass == cLong)
+      visitInstruction(CI);  // don't handle unsigned src yet!
+
+    // We don't have the facilities for directly storing byte sized data to
+    // memory.  Promote it to 16 bits.  We also must promote unsigned values to
+    // larger classes because we only have signed FP stores.
+    unsigned StoreClass  = DestClass;
+    const Type *StoreTy  = DestTy;
+    if (StoreClass == cByte || DestTy->isUnsigned())
+      switch (StoreClass) {
+      case cByte:  StoreTy = Type::ShortTy; StoreClass = cShort; break;
+      case cShort: StoreTy = Type::IntTy;   StoreClass = cInt;   break;
+      case cInt:   StoreTy = Type::LongTy;  StoreClass = cLong;  break;
+      default: assert(0 && "Unknown store class!");
+      }
+
+    // Store the FP value to memory as an integer and reload it from there...
+    int FrameIdx =
+      F->getFrameInfo()->CreateStackObject(StoreTy, TM.getTargetData());
+
+    static const unsigned Op1[] =
+      { 0, X86::FISTr16, X86::FISTr32, 0, X86::FISTPr64 };
+    addFrameReference(BuildMI(BB, Op1[StoreClass], 5), FrameIdx).addReg(SrcReg);
+
+    if (DestClass > cInt) visitInstruction(CI);
+    static const unsigned Op2[] = { X86::MOVmr8, X86::MOVmr16, X86::MOVmr32 };
+    addFrameReference(BuildMI(BB, Op2[DestClass], 5, DestReg), FrameIdx);
+    return;
+  }
+
   // Anything we haven't handled already, we can't (yet) handle at all.
-  //
-  // FP to integral casts can be handled with FISTP to store onto the
-  // stack while converting to integer, followed by a MOV to load from
-  // the stack into the result register. Integral to FP casts can be
-  // handled with MOV to store onto the stack, followed by a FILD to
-  // load from the stack while converting to FP. For the moment, I
-  // can't quite get straight in my head how to borrow myself some
-  // stack space and write on it. Otherwise, this would be trivial.
   visitInstruction (CI);
 }
 
@@ -1164,9 +1287,13 @@
       unsigned idxValue = CUI->getValue();
       unsigned memberOffset =
 	TD.getStructLayout(StTy)->MemberOffsets[idxValue];
-      // Emit an ADD to add memberOffset to the basePtr.
-      BMI(MBB, IP, X86::ADDri32, 2,
-          nextBasePtrReg).addReg(basePtrReg).addZImm(memberOffset);
+      if (memberOffset) {
+	// Emit an ADD to add memberOffset to the basePtr.
+	BMI(MBB, IP, X86::ADDri32, 2,
+	    nextBasePtrReg).addReg(basePtrReg).addZImm(memberOffset);
+      } else {
+	BMI(MBB, IP, X86::MOVrr32, 1, nextBasePtrReg).addReg(basePtrReg);
+      }
       // The next type is the member of the structure selected by the
       // index.
       Ty = StTy->getElementTypes()[idxValue];
@@ -1276,7 +1403,7 @@
   BuildMI(BB, X86::ANDri32, 2, AlignedSize).addReg(AddedSizeReg).addZImm(~15);
   
   // Subtract size from stack pointer, thereby allocating some space.
-  BuildMI(BB, X86::SUBri32, 2, X86::ESP).addReg(X86::ESP).addZImm(AlignedSize);
+  BuildMI(BB, X86::SUBrr32, 2, X86::ESP).addReg(X86::ESP).addReg(AlignedSize);
 
   // Put a pointer to the space into the result register, by copying
   // the stack pointer.