[llvm-commits] CVS: llvm/lib/Target/X86/X86.h X86.td X86CodeEmitter.cpp X86ISelPattern.cpp X86InstrInfo.cpp X86InstrInfo.h X86InstrInfo.td X86RegisterInfo.cpp X86RegisterInfo.td X86TargetMachine.cpp

Nate Begeman natebegeman at mac.com
Wed Jul 6 11:59:15 PDT 2005



Changes in directory llvm/lib/Target/X86:

X86.h updated: 1.32 -> 1.33
X86.td updated: 1.14 -> 1.15
X86CodeEmitter.cpp updated: 1.80 -> 1.81
X86ISelPattern.cpp updated: 1.144 -> 1.145
X86InstrInfo.cpp updated: 1.38 -> 1.39
X86InstrInfo.h updated: 1.46 -> 1.47
X86InstrInfo.td updated: 1.128 -> 1.129
X86RegisterInfo.cpp updated: 1.105 -> 1.106
X86RegisterInfo.td updated: 1.18 -> 1.19
X86TargetMachine.cpp updated: 1.80 -> 1.81
---
Log message:

First round of support for doing scalar FP using the SSE2 ISA extension and
XMM registers.  There are many known deficiencies and fixmes, which will be
addressed ASAP.  The major benefit of this work is that it will allow the
LLVM register allocator to allocate FP registers across basic blocks.

The x86 backend still defaults to x87-style FP.  To enable this work, you
must pass -enable-sse-scalar-fp and either -sse2 or -sse3 to llc.
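
For example, to compile a bitcode file through the new path (file name
illustrative):

        llc -enable-sse-scalar-fp -sse2 foo.bc -o foo.s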

As an example of the before and after, consider:

    double foo(double *P) {
      double Sum = 0;
      int i;
      for (i = 0; i < 1000; ++i)
        Sum += P[i];
      return Sum;
    }

The inner loop looks like the following:
x87:
.LBB_foo_1:     # no_exit
        fldl (%esp)
        faddl (%eax,%ecx,8)
        fstpl (%esp)
        incl %ecx
        cmpl $1000, %ecx
        #FP_REG_KILL
        jne .LBB_foo_1  # no_exit

SSE2:
.LBB_foo_1:     # no_exit
        addsd (%eax,%ecx,8), %xmm0
        incl %ecx
        cmpl $1000, %ecx
        #FP_REG_KILL
        jne .LBB_foo_1  # no_exit


---
Diffs of the changes:  (+382 -91)

 X86.h                |    1 
 X86.td               |    2 
 X86CodeEmitter.cpp   |   14 +
 X86ISelPattern.cpp   |  395 +++++++++++++++++++++++++++++++++++++++++----------
 X86InstrInfo.cpp     |    2 
 X86InstrInfo.h       |    4 
 X86InstrInfo.td      |   20 ++
 X86RegisterInfo.cpp  |   22 ++
 X86RegisterInfo.td   |    4 
 X86TargetMachine.cpp |    9 -
 10 files changed, 382 insertions(+), 91 deletions(-)


Index: llvm/lib/Target/X86/X86.h
diff -u llvm/lib/Target/X86/X86.h:1.32 llvm/lib/Target/X86/X86.h:1.33
--- llvm/lib/Target/X86/X86.h:1.32	Mon Jun 27 01:30:12 2005
+++ llvm/lib/Target/X86/X86.h	Wed Jul  6 13:59:03 2005
@@ -28,6 +28,7 @@
 };
 
 extern X86VectorEnum X86Vector;
+extern bool X86ScalarSSE;
 
 /// createX86SimpleInstructionSelector - This pass converts an LLVM function
 /// into a machine code representation in a very simple peep-hole fashion.  The


Index: llvm/lib/Target/X86/X86.td
diff -u llvm/lib/Target/X86/X86.td:1.14 llvm/lib/Target/X86/X86.td:1.15
--- llvm/lib/Target/X86/X86.td:1.14	Sun Oct  3 15:36:57 2004
+++ llvm/lib/Target/X86/X86.td	Wed Jul  6 13:59:03 2005
@@ -61,7 +61,7 @@
 
 def X86 : Target {
   // Specify the callee saved registers.
-  let CalleeSavedRegisters = [ESI, EDI, EBX, EBP];
+  let CalleeSavedRegisters = [ESI, EDI, EBX, EBP, XMM4, XMM5, XMM6, XMM7];
 
   // Yes, pointers are 32-bits in size.
   let PointerType = i32;


Index: llvm/lib/Target/X86/X86CodeEmitter.cpp
diff -u llvm/lib/Target/X86/X86CodeEmitter.cpp:1.80 llvm/lib/Target/X86/X86CodeEmitter.cpp:1.81
--- llvm/lib/Target/X86/X86CodeEmitter.cpp:1.80	Thu May 19 00:54:33 2005
+++ llvm/lib/Target/X86/X86CodeEmitter.cpp	Wed Jul  6 13:59:03 2005
@@ -361,8 +361,18 @@
   // Emit the repeat opcode prefix as needed.
   if ((Desc.TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3);
 
-  // Emit instruction prefixes if necessary
-  if (Desc.TSFlags & X86II::OpSize) MCE.emitByte(0x66);// Operand size...
+  // Emit the operand size opcode prefix as needed.
+  if (Desc.TSFlags & X86II::OpSize) MCE.emitByte(0x66);
+
+  // Emit the double precision sse fp opcode prefix as needed.
+  if ((Desc.TSFlags & X86II::Op0Mask) == X86II::XD) {
+    MCE.emitByte(0xF2); MCE.emitByte(0x0F);
+  }
+
+  // Emit the single precision sse fp opcode prefix as needed.
+  if ((Desc.TSFlags & X86II::Op0Mask) == X86II::XS) {
+    MCE.emitByte(0xF3); MCE.emitByte(0x0F);
+  }
 
   switch (Desc.TSFlags & X86II::Op0Mask) {
   case X86II::TB:

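As a reference for the new XD/XS prefix handling: these are the 0xF2 and 0xF3
bytes that, together with the 0x0F escape, select the double and single
precision scalar SSE opcode maps.  For example, addsd %xmm1, %xmm0 should
encode roughly as:

        0xF2 0x0F 0x58 0xC1     # XD prefix, 0x0F escape, ADDSD opcode, ModRM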

Index: llvm/lib/Target/X86/X86ISelPattern.cpp
diff -u llvm/lib/Target/X86/X86ISelPattern.cpp:1.144 llvm/lib/Target/X86/X86ISelPattern.cpp:1.145
--- llvm/lib/Target/X86/X86ISelPattern.cpp:1.144	Tue Jul  5 14:58:54 2005
+++ llvm/lib/Target/X86/X86ISelPattern.cpp	Wed Jul  6 13:59:03 2005
@@ -97,15 +97,13 @@
       setShiftAmountFlavor(Mask);   // shl X, 32 == shl X, 0
 
       // Set up the register classes.
+      // FIXME: Eliminate these two classes when legalize can handle promotions
+      // well.
+      addRegisterClass(MVT::i1, X86::R8RegisterClass);
       addRegisterClass(MVT::i8, X86::R8RegisterClass);
       addRegisterClass(MVT::i16, X86::R16RegisterClass);
       addRegisterClass(MVT::i32, X86::R32RegisterClass);
-      addRegisterClass(MVT::f64, X86::RFPRegisterClass);
-
-      // FIXME: Eliminate these two classes when legalize can handle promotions
-      // well.
-/**/  addRegisterClass(MVT::i1, X86::R8RegisterClass);
-
+      
       setOperationAction(ISD::SINT_TO_FP       , MVT::i64  , Custom);
       setOperationAction(ISD::BRCONDTWOWAY     , MVT::Other, Expand);
       setOperationAction(ISD::MEMMOVE          , MVT::Other, Expand);
@@ -123,7 +121,7 @@
       setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
       setOperationAction(ISD::CTTZ             , MVT::i32  , Expand);
       setOperationAction(ISD::CTLZ             , MVT::i32  , Expand);
-
+      
       setOperationAction(ISD::READIO           , MVT::i1   , Expand);
       setOperationAction(ISD::READIO           , MVT::i8   , Expand);
       setOperationAction(ISD::READIO           , MVT::i16  , Expand);
@@ -132,24 +130,47 @@
       setOperationAction(ISD::WRITEIO          , MVT::i8   , Expand);
       setOperationAction(ISD::WRITEIO          , MVT::i16  , Expand);
       setOperationAction(ISD::WRITEIO          , MVT::i32  , Expand);
-
-      if (!UnsafeFPMath) {
-        setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
-        setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
-      }
-
+      
       // These should be promoted to a larger select which is supported.
-/**/  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
+      setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
       setOperationAction(ISD::SELECT           , MVT::i8   , Promote);
-
+      
+      if (X86ScalarSSE) {
+        // Set up the FP register classes.
+        addRegisterClass(MVT::f32, X86::RXMMRegisterClass);
+        addRegisterClass(MVT::f64, X86::RXMMRegisterClass);
+        
+        setOperationAction(ISD::EXTLOAD,  MVT::f32, Expand);
+        setOperationAction(ISD::ZEXTLOAD, MVT::f32, Expand);
+        
+        // We don't support sin/cos/fabs/fneg/frem in SSE regs
+        setOperationAction(ISD::FSIN , MVT::f64, Expand);
+        setOperationAction(ISD::FCOS , MVT::f64, Expand);
+        setOperationAction(ISD::FABS , MVT::f64, Expand);
+        setOperationAction(ISD::FNEG , MVT::f64, Expand);
+        setOperationAction(ISD::SREM , MVT::f64, Expand);
+        setOperationAction(ISD::FSIN , MVT::f32, Expand);
+        setOperationAction(ISD::FCOS , MVT::f32, Expand);
+        setOperationAction(ISD::FABS , MVT::f32, Expand);
+        setOperationAction(ISD::FNEG , MVT::f32, Expand);
+        setOperationAction(ISD::SREM , MVT::f32, Expand);
+      } else {
+        // Set up the FP register classes.
+        addRegisterClass(MVT::f64, X86::RFPRegisterClass);
+        
+        if (!UnsafeFPMath) {
+          setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
+          setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
+        }
+        
+        addLegalFPImmediate(+0.0); // FLD0
+        addLegalFPImmediate(+1.0); // FLD1
+        addLegalFPImmediate(-0.0); // FLD0/FCHS
+        addLegalFPImmediate(-1.0); // FLD1/FCHS
+      }
       computeRegisterProperties();
-
-      addLegalFPImmediate(+0.0); // FLD0
-      addLegalFPImmediate(+1.0); // FLD1
-      addLegalFPImmediate(-0.0); // FLD0/FCHS
-      addLegalFPImmediate(-1.0); // FLD1/FCHS
     }
-
+    
     // Return the number of bytes that a function should pop when it returns (in
     // addition to the space used by the return address).
     //
@@ -400,7 +421,10 @@
     RetVals.push_back(MVT::i32);
     break;
   case MVT::f32:
-    RetVals.push_back(MVT::f64);
+    if (X86ScalarSSE)
+      RetVals.push_back(MVT::f32);
+    else
+      RetVals.push_back(MVT::f64);
     break;
   case MVT::i64:
     RetVals.push_back(MVT::i32);
@@ -805,7 +829,10 @@
     RetVals.push_back(MVT::i32);
     break;
   case MVT::f32:
-    RetVals.push_back(MVT::f64);
+    if (X86ScalarSSE)
+      RetVals.push_back(MVT::f32);
+    else
+      RetVals.push_back(MVT::f64);
     break;
   case MVT::i64:
     RetVals.push_back(MVT::i32);
@@ -1041,6 +1068,8 @@
         BuildMI(BB, X86::MOV32rr, 1, LI->second).addReg(LI->first);
       } else if (RC == X86::RFPRegisterClass) {
         BuildMI(BB, X86::FpMOV, 1, LI->second).addReg(LI->first);
+      } else if (RC == X86::RXMMRegisterClass) {
+        BuildMI(BB, X86::MOVAPDrr, 1, LI->second).addReg(LI->first);
       } else {
         assert(0 && "Unknown regclass!");
       }
@@ -1641,6 +1670,11 @@
     /*missing*/0,  /*missing*/0, X86::FCMOVB , X86::FCMOVBE,
     X86::FCMOVA ,  X86::FCMOVAE, X86::FCMOVP , X86::FCMOVNP
   };
+  static const unsigned SSE_CMOVTAB[] = {
+    0 /* CMPEQSS */, 4 /* CMPNEQSS */, 1 /* CMPLTSS */, 2 /* CMPLESS */,
+    2 /* CMPLESS */, 1 /* CMPLTSS */, /*missing*/0, /*missing*/0,
+    /*missing*/0,  /*missing*/0, /*missing*/0, /*missing*/0
+  };
 
   if (SetCCSDNode *SetCC = dyn_cast<SetCCSDNode>(Cond)) {
     if (MVT::isInteger(SetCC->getOperand(0).getValueType())) {
@@ -1657,6 +1691,20 @@
       case ISD::SETULE: CondCode = BE; break;
       case ISD::SETUGE: CondCode = AE; break;
       }
+    } else if (X86ScalarSSE) {
+      switch (SetCC->getCondition()) {
+      default: assert(0 && "Unknown scalar fp comparison!");
+      case ISD::SETEQ:  CondCode = EQ; break;
+      case ISD::SETNE:  CondCode = NE; break;
+      case ISD::SETULT:
+      case ISD::SETLT:  CondCode = LT; break;
+      case ISD::SETULE:
+      case ISD::SETLE:  CondCode = LE; break;
+      case ISD::SETUGT:
+      case ISD::SETGT:  CondCode = GT; break;
+      case ISD::SETUGE:
+      case ISD::SETGE:  CondCode = GE; break;
+      }
     } else {
       // On a floating point condition, the flags are set as follows:
       // ZF  PF  CF   op
@@ -1693,6 +1741,79 @@
     }
   }
 
+  // There's no SSE equivalent of FCMOVE.  In some cases we can fake it up, in
+  // others we will have to do the PowerPC thing and generate an MBB for the
+  // true and false values and select between them with a PHI.
+  if (X86ScalarSSE) { 
+    if (CondCode != NOT_SET) {
+      unsigned CMPSOpc = (SVT == MVT::f64) ? X86::CMPSDrr : X86::CMPSSrr;
+      unsigned CMPSImm = SSE_CMOVTAB[CondCode];
+      // FIXME check for min
+      // FIXME check for max
+      // FIXME check for reverse
+      unsigned LHS = SelectExpr(Cond.getOperand(0));
+      unsigned RHS = SelectExpr(Cond.getOperand(1));
+      // emit compare mask
+      unsigned MaskReg = MakeReg(SVT);
+      BuildMI(BB, CMPSOpc, 3, MaskReg).addReg(LHS).addReg(RHS).addImm(CMPSImm);
+      // emit and with mask
+      unsigned TrueMask = MakeReg(SVT);
+      unsigned AndOpc = (SVT == MVT::f32) ? X86::ANDPSrr : X86::ANDPDrr;
+      BuildMI(BB, AndOpc, 2, TrueMask).addReg(RTrue).addReg(MaskReg);
+      // emit and with inverse mask
+      unsigned FalseMask = MakeReg(SVT);
+      unsigned AndnOpc = (SVT == MVT::f32) ? X86::ANDNPSrr : X86::ANDNPDrr;
+      BuildMI(BB, AndnOpc, 2, FalseMask).addReg(RFalse).addReg(MaskReg);
+      // emit or into dest reg
+      unsigned OROpc = (SVT == MVT::f32) ? X86::ORPSrr : X86::ORPDrr;
+      BuildMI(BB, OROpc, 2, RDest).addReg(TrueMask).addReg(FalseMask);
+      return;
+    } else {
+      // do the test and branch thing
+      // Get the condition into the zero flag.
+      unsigned CondReg = SelectExpr(Cond);
+      BuildMI(BB, X86::TEST8rr, 2).addReg(CondReg).addReg(CondReg);
+
+      // Create an iterator with which to insert the MBB for copying the false
+      // value and the MBB to hold the PHI instruction for this SetCC.
+      MachineBasicBlock *thisMBB = BB;
+      const BasicBlock *LLVM_BB = BB->getBasicBlock();
+      ilist<MachineBasicBlock>::iterator It = BB;
+      ++It;
+
+      //  thisMBB:
+      //  ...
+      //   TrueVal = ...
+      //   cmpTY ccX, r1, r2
+      //   bCC sinkMBB
+      //   fallthrough --> copy0MBB
+      MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB);
+      MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB);
+      BuildMI(BB, X86::JNE, 1).addMBB(sinkMBB);
+      MachineFunction *F = BB->getParent();
+      F->getBasicBlockList().insert(It, copy0MBB);
+      F->getBasicBlockList().insert(It, sinkMBB);
+      // Update machine-CFG edges
+      BB->addSuccessor(copy0MBB);
+      BB->addSuccessor(sinkMBB);
+
+      //  copy0MBB:
+      //   %FalseValue = ...
+      //   # fallthrough to sinkMBB
+      BB = copy0MBB;
+      // Update machine-CFG edges
+      BB->addSuccessor(sinkMBB);
+
+      //  sinkMBB:
+      //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+      //  ...
+      BB = sinkMBB;
+      BuildMI(BB, X86::PHI, 4, RDest).addReg(RFalse)
+        .addMBB(copy0MBB).addReg(RTrue).addMBB(thisMBB);
+    }
+    return;
+  }
+
   unsigned Opc = 0;
   if (CondCode != NOT_SET) {
     switch (SVT) {
@@ -1702,7 +1823,7 @@
     case MVT::f64: Opc = CMOVTABFP[CondCode]; break;
     }
   }
-
+  
   // Finally, if we weren't able to fold this, just emit the condition and test
   // it.
   if (CondCode == NOT_SET || Opc == 0) {
@@ -1757,8 +1878,8 @@
       return;
     }
   } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(RHS)) {
-    if (CN->isExactlyValue(+0.0) ||
-        CN->isExactlyValue(-0.0)) {
+    if (!X86ScalarSSE && (CN->isExactlyValue(+0.0) ||
+                          CN->isExactlyValue(-0.0))) {
       unsigned Reg = SelectExpr(LHS);
       BuildMI(BB, X86::FTST, 1).addReg(Reg);
       BuildMI(BB, X86::FNSTSW8r, 0);
@@ -1791,7 +1912,8 @@
   case MVT::i8:  Opc = X86::CMP8rr;  break;
   case MVT::i16: Opc = X86::CMP16rr; break;
   case MVT::i32: Opc = X86::CMP32rr; break;
-  case MVT::f64: Opc = X86::FUCOMIr; break;
+  case MVT::f32: Opc = X86::UCOMISSrr; break;
+  case MVT::f64: Opc = X86ScalarSSE ? X86::UCOMISDrr : X86::FUCOMIr; break;
   }
   unsigned Tmp1, Tmp2;
   if (getRegPressure(LHS) > getRegPressure(RHS)) {
@@ -2040,6 +2162,11 @@
   default:
     Node->dump();
     assert(0 && "Node not handled!\n");
+  case ISD::FP_EXTEND:
+    assert(X86ScalarSSE && "Scalar SSE FP must be enabled to use f32"); 
+    Tmp1 = SelectExpr(N.getOperand(0));
+    BuildMI(BB, X86::CVTSS2SDrr, 1, Result).addReg(Tmp1);
+    return Result;
   case ISD::CopyFromReg:
     Select(N.getOperand(0));
     if (Result == 1) {
@@ -2212,6 +2339,37 @@
 
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP: {
+    Tmp1 = SelectExpr(N.getOperand(0));  // Get the operand register
+    unsigned PromoteOpcode = 0;
+
+    // We can handle any sint-to-fp, and 8- and 16-bit uint-to-fp, with the
+    // direct SSE conversion instructions.
+    if (X86ScalarSSE) {
+      MVT::ValueType SrcTy = N.getOperand(0).getValueType();
+      MVT::ValueType DstTy = N.getValueType();
+      switch (SrcTy) {
+      case MVT::i1:
+      case MVT::i8:
+        PromoteOpcode = (N.getOpcode() == ISD::UINT_TO_FP) ?
+          X86::MOVZX32rr8 : X86::MOVSX32rr8;
+        break;
+      case MVT::i16:
+        PromoteOpcode = (N.getOpcode() == ISD::UINT_TO_FP) ?
+          X86::MOVZX32rr16 : X86::MOVSX32rr16;
+        break;
+      default:
+        assert(N.getOpcode() != ISD::UINT_TO_FP);
+        break;
+      }
+      if (PromoteOpcode) {
+        Tmp2 = MakeReg(MVT::i32);
+        BuildMI(BB, PromoteOpcode, 1, Tmp2).addReg(Tmp1);
+        Tmp1 = Tmp2;
+      }
+      Opc = (DstTy == MVT::f64) ? X86::CVTSI2SDrr : X86::CVTSI2SSrr;
+      BuildMI(BB, Opc, 1, Result).addReg(Tmp1);
+      return Result;
+    }
+    
     // FIXME: Most of this grunt work should be done by legalize!
     ContainsFPCode = true;
 
@@ -2221,8 +2379,6 @@
     //
     MVT::ValueType PromoteType = MVT::Other;
     MVT::ValueType SrcTy = N.getOperand(0).getValueType();
-    unsigned PromoteOpcode = 0;
-    unsigned RealDestReg = Result;
     switch (SrcTy) {
     case MVT::i1:
     case MVT::i8:
@@ -2245,8 +2401,6 @@
       break;
     }
 
-    Tmp1 = SelectExpr(N.getOperand(0));  // Get the operand register
-
     if (PromoteType != MVT::Other) {
       Tmp2 = MakeReg(PromoteType);
       BuildMI(BB, PromoteOpcode, 1, Tmp2).addReg(Tmp1);
@@ -2272,31 +2426,28 @@
       break;
     default: break; // No promotion required.
     }
-
-    if (Node->getOpcode() == ISD::UINT_TO_FP && Result != RealDestReg) {
-      // If this is a cast from uint -> double, we need to be careful when if
-      // the "sign" bit is set.  If so, we don't want to make a negative number,
-      // we want to make a positive number.  Emit code to add an offset if the
-      // sign bit is set.
-
-      // Compute whether the sign bit is set by shifting the reg right 31 bits.
-      unsigned IsNeg = MakeReg(MVT::i32);
-      BuildMI(BB, X86::SHR32ri, 2, IsNeg).addReg(Tmp1).addImm(31);
-
-      // Create a CP value that has the offset in one word and 0 in the other.
-      static ConstantInt *TheOffset = ConstantUInt::get(Type::ULongTy,
-                                                        0x4f80000000000000ULL);
-      unsigned CPI = F->getConstantPool()->getConstantPoolIndex(TheOffset);
-      BuildMI(BB, X86::FADD32m, 5, RealDestReg).addReg(Result)
-        .addConstantPoolIndex(CPI).addZImm(4).addReg(IsNeg).addSImm(0);
-    }
-    return RealDestReg;
+    return Result;
   }
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT: {
     // FIXME: Most of this grunt work should be done by legalize!
     Tmp1 = SelectExpr(N.getOperand(0));  // Get the operand register
 
+    // If the target supports SSE2 and is performing FP operations in SSE regs
+    // instead of the FP stack, then we can use the efficient CVTSS2SI and
+    // CVTSD2SI instructions.
+    if (ISD::FP_TO_SINT == N.getOpcode() && X86ScalarSSE) {
+      if (MVT::f32 == N.getOperand(0).getValueType()) {
+        BuildMI(BB, X86::CVTSS2SIrr, 1, Result).addReg(Tmp1);
+      } else if (MVT::f64 == N.getOperand(0).getValueType()) {
+        BuildMI(BB, X86::CVTSD2SIrr, 1, Result).addReg(Tmp1);
+      } else {
+        assert(0 && "Not an f32 or f64?");
+        abort();
+      }
+      return Result;
+    } 
+
     // Change the floating point control register to use "round towards zero"
     // mode when truncating to an integer value.
     //
@@ -2385,9 +2536,15 @@
       case MVT::i8:  Opc = X86::ADD8rm;  break;
       case MVT::i16: Opc = X86::ADD16rm; break;
       case MVT::i32: Opc = X86::ADD32rm; break;
+      case MVT::f32: Opc = X86::ADDSSrm; break;
       case MVT::f64:
         // For F64, handle promoted load operations (from F32) as well!
-        Opc = Op1.getOpcode() == ISD::LOAD ? X86::FADD64m : X86::FADD32m;
+        if (X86ScalarSSE) {
+          assert(Op1.getOpcode() == ISD::LOAD && "SSE load not promoted");
+          Opc = X86::ADDSDrm;
+        } else {
+          Opc = Op1.getOpcode() == ISD::LOAD ? X86::FADD64m : X86::FADD32m;
+        }
         break;
       }
       X86AddressMode AM;
@@ -2458,7 +2615,8 @@
     case MVT::i8:  Opc = X86::ADD8rr; break;
     case MVT::i16: Opc = X86::ADD16rr; break;
     case MVT::i32: Opc = X86::ADD32rr; break;
-    case MVT::f64: Opc = X86::FpADD; break;
+    case MVT::f32: Opc = X86::ADDSSrr; break;
+    case MVT::f64: Opc = X86ScalarSSE ? X86::ADDSDrr : X86::FpADD; break;
     }
 
     if (getRegPressure(Op0) > getRegPressure(Op1)) {
@@ -2472,18 +2630,29 @@
     BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2);
     return Result;
 
+  case ISD::FSQRT:
+    Tmp1 = SelectExpr(Node->getOperand(0));
+    if (X86ScalarSSE) {
+      Opc = (N.getValueType() == MVT::f32) ? X86::SQRTSSrr : X86::SQRTSDrr;
+      BuildMI(BB, Opc, 1, Result).addReg(Tmp1);
+    } else {
+      BuildMI(BB, X86::FSQRT, 1, Result).addReg(Tmp1);
+    }
+    return Result;
+
+  // FIXME:
+  // Once we can spill 16 byte constants into the constant pool, we can
+  // implement SSE equivalents of FABS and FCHS.
   case ISD::FABS:
   case ISD::FNEG:
   case ISD::FSIN:
   case ISD::FCOS:
-  case ISD::FSQRT:
     assert(N.getValueType()==MVT::f64 && "Illegal type for this operation");
     Tmp1 = SelectExpr(Node->getOperand(0));
     switch (N.getOpcode()) {
     default: assert(0 && "Unreachable!");
     case ISD::FABS: BuildMI(BB, X86::FABS, 1, Result).addReg(Tmp1); break;
     case ISD::FNEG: BuildMI(BB, X86::FCHS, 1, Result).addReg(Tmp1); break;
-    case ISD::FSQRT: BuildMI(BB, X86::FSQRT, 1, Result).addReg(Tmp1); break;
     case ISD::FSIN: BuildMI(BB, X86::FSIN, 1, Result).addReg(Tmp1); break;
     case ISD::FCOS: BuildMI(BB, X86::FCOS, 1, Result).addReg(Tmp1); break;
     }
@@ -2550,11 +2719,21 @@
       X86::SUB8rm, X86::SUB16rm, X86::SUB32rm, X86::FSUB32m, X86::FSUB64m,
       X86::SUB8rr, X86::SUB16rr, X86::SUB32rr, X86::FpSUB  , X86::FpSUB,
     };
+    static const unsigned SSE_SUBTab[] = {
+      X86::SUB8ri, X86::SUB16ri, X86::SUB32ri, 0, 0,
+      X86::SUB8rm, X86::SUB16rm, X86::SUB32rm, X86::SUBSSrm, X86::SUBSDrm,
+      X86::SUB8rr, X86::SUB16rr, X86::SUB32rr, X86::SUBSSrr, X86::SUBSDrr,
+    };
     static const unsigned MULTab[] = {
       0, X86::IMUL16rri, X86::IMUL32rri, 0, 0,
       0, X86::IMUL16rm , X86::IMUL32rm, X86::FMUL32m, X86::FMUL64m,
       0, X86::IMUL16rr , X86::IMUL32rr, X86::FpMUL  , X86::FpMUL,
     };
+    static const unsigned SSE_MULTab[] = {
+      0, X86::IMUL16rri, X86::IMUL32rri, 0, 0,
+      0, X86::IMUL16rm , X86::IMUL32rm, X86::MULSSrm, X86::MULSDrm,
+      0, X86::IMUL16rr , X86::IMUL32rr, X86::MULSSrr, X86::MULSDrr,
+    };
     static const unsigned ANDTab[] = {
       X86::AND8ri, X86::AND16ri, X86::AND32ri, 0, 0,
       X86::AND8rm, X86::AND16rm, X86::AND32rm, 0, 0,
@@ -2637,8 +2816,8 @@
       }
       switch (Node->getOpcode()) {
       default: assert(0 && "Unreachable!");
-      case ISD::SUB: Opc = SUBTab[Opc]; break;
-      case ISD::MUL: Opc = MULTab[Opc]; break;
+      case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break;
+      case ISD::MUL: Opc = X86ScalarSSE ? SSE_MULTab[Opc] : MULTab[Opc]; break;
       case ISD::AND: Opc = ANDTab[Opc]; break;
       case ISD::OR:  Opc =  ORTab[Opc]; break;
       case ISD::XOR: Opc = XORTab[Opc]; break;
@@ -2656,7 +2835,7 @@
         goto FoldOps;
       } else {
        // For FP, emit 'reverse' subtract, with a memory operand.
-        if (N.getValueType() == MVT::f64) {
+        if (N.getValueType() == MVT::f64 && !X86ScalarSSE) {
           if (Op0.getOpcode() == ISD::EXTLOAD)
             Opc = X86::FSUBR32m;
           else
@@ -2678,13 +2857,17 @@
       case MVT::i8:  Opc = 5; break;
       case MVT::i16: Opc = 6; break;
       case MVT::i32: Opc = 7; break;
+      case MVT::f32: Opc = 8; break;
         // For F64, handle promoted load operations (from F32) as well!
-      case MVT::f64: Opc = Op1.getOpcode() == ISD::LOAD ? 9 : 8; break;
+      case MVT::f64: 
+        assert((!X86ScalarSSE || Op1.getOpcode() == ISD::LOAD) && 
+               "SSE load should have been promoted");
+        Opc = Op1.getOpcode() == ISD::LOAD ? 9 : 8; break;
       }
       switch (Node->getOpcode()) {
       default: assert(0 && "Unreachable!");
-      case ISD::SUB: Opc = SUBTab[Opc]; break;
-      case ISD::MUL: Opc = MULTab[Opc]; break;
+      case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break;
+      case ISD::MUL: Opc = X86ScalarSSE ? SSE_MULTab[Opc] : MULTab[Opc]; break;
       case ISD::AND: Opc = ANDTab[Opc]; break;
       case ISD::OR:  Opc =  ORTab[Opc]; break;
       case ISD::XOR: Opc = XORTab[Opc]; break;
@@ -2725,8 +2908,8 @@
     }
     switch (Node->getOpcode()) {
     default: assert(0 && "Unreachable!");
-    case ISD::SUB: Opc = SUBTab[Opc]; break;
-    case ISD::MUL: Opc = MULTab[Opc]; break;
+    case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break;
+    case ISD::MUL: Opc = X86ScalarSSE ? SSE_MULTab[Opc] : MULTab[Opc]; break;
     case ISD::AND: Opc = ANDTab[Opc]; break;
     case ISD::OR:  Opc =  ORTab[Opc]; break;
     case ISD::XOR: Opc = XORTab[Opc]; break;
@@ -2844,7 +3027,7 @@
 
     if (N.getOpcode() == ISD::SDIV) {
       // We can fold loads into FpDIVs, but not really into any others.
-      if (N.getValueType() == MVT::f64) {
+      if (N.getValueType() == MVT::f64 || !X86ScalarSSE) {
         // Check for reversed and unreversed DIV.
         if (isFoldableLoad(N.getOperand(0), N.getOperand(1), true)) {
           if (N.getOperand(0).getOpcode() == ISD::EXTLOAD)
@@ -2962,8 +3145,12 @@
       ClrOpcode = X86::MOV32ri;
       SExtOpcode = X86::CDQ;
       break;
+    case MVT::f32:
+      BuildMI(BB, X86::DIVSSrr, 2, Result).addReg(Tmp1).addReg(Tmp2);
+      return Result;
     case MVT::f64:
-      BuildMI(BB, X86::FpDIV, 2, Result).addReg(Tmp1).addReg(Tmp2);
+      Opc = X86ScalarSSE ? X86::DIVSDrr : X86::FpDIV;
+      BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2);
       return Result;
     }
 
@@ -3108,7 +3295,15 @@
     case MVT::i8:  Opc = X86::MOV8rm; break;
     case MVT::i16: Opc = X86::MOV16rm; break;
     case MVT::i32: Opc = X86::MOV32rm; break;
-    case MVT::f64: Opc = X86::FLD64m; ContainsFPCode = true; break;
+    case MVT::f32: Opc = X86::MOVSSrm; break;
+    case MVT::f64: 
+      if (X86ScalarSSE) {
+        Opc = X86::MOVSDrm;
+      } else {
+        Opc = X86::FLD64m;
+        ContainsFPCode = true; 
+      }
+      break;
     }
 
     if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N.getOperand(1))){
@@ -3385,9 +3580,21 @@
           BuildMI(BB, X86::MOV32rr, 1, Result+1).addReg(X86::EDX);
         break;
       case MVT::f64:     // Floating-point return values live in %ST(0)
-        ContainsFPCode = true;
-        BuildMI(BB, X86::FpGETRESULT, 1, Result);
-        break;
+        if (X86ScalarSSE) {
+          ContainsFPCode = true;
+          BuildMI(BB, X86::FpGETRESULT, 1, X86::FP0);
+
+          unsigned Size = MVT::getSizeInBits(MVT::f64)/8;
+          MachineFunction *F = BB->getParent();
+          int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size);
+          addFrameReference(BuildMI(BB, X86::FST64m, 5), FrameIdx).addReg(X86::FP0);
+          addFrameReference(BuildMI(BB, X86::MOVSDrm, 4, Result), FrameIdx);
+          break;
+        } else {
+          ContainsFPCode = true;
+          BuildMI(BB, X86::FpGETRESULT, 1, Result);
+          break;
+        }
       }
     return Result+N.ResNo-1;
   }
@@ -3977,7 +4184,15 @@
       case MVT::i8:  Opc = X86::MOV8rr; break;
       case MVT::i16: Opc = X86::MOV16rr; break;
       case MVT::i32: Opc = X86::MOV32rr; break;
-      case MVT::f64: Opc = X86::FpMOV; ContainsFPCode = true; break;
+      case MVT::f32: Opc = X86::MOVAPSrr; break;
+      case MVT::f64: 
+        if (X86ScalarSSE) {
+          Opc = X86::MOVAPDrr;
+        } else {
+          Opc = X86::FpMOV; 
+          ContainsFPCode = true; 
+        }
+        break;
       }
       BuildMI(BB, Opc, 1, Tmp2).addReg(Tmp1);
     }
@@ -4018,12 +4233,38 @@
       }
       switch (N.getOperand(1).getValueType()) {
       default: assert(0 && "All other types should have been promoted!!");
+      case MVT::f32:
+        if (X86ScalarSSE) {
+          // Spill the value to memory and reload it into top of stack.
+          unsigned Size = MVT::getSizeInBits(MVT::f32)/8;
+          MachineFunction *F = BB->getParent();
+          int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size);
+          addFrameReference(BuildMI(BB, X86::MOVSSmr, 5), FrameIdx).addReg(Tmp1);
+          addFrameReference(BuildMI(BB, X86::FLD32m, 4, X86::FP0), FrameIdx);
+          BuildMI(BB, X86::FpSETRESULT, 1).addReg(X86::FP0);
+          ContainsFPCode = true; 
+        } else {
+          assert(0 && "MVT::f32 only legal with scalar sse fp");
+          abort();
+        }
+        break;
       case MVT::f64:
-	BuildMI(BB, X86::FpSETRESULT, 1).addReg(Tmp1);
-	break;
+        if (X86ScalarSSE) {
+          // Spill the value to memory and reload it into top of stack.
+          unsigned Size = MVT::getSizeInBits(MVT::f64)/8;
+          MachineFunction *F = BB->getParent();
+          int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size);
+          addFrameReference(BuildMI(BB, X86::MOVSDmr, 5), FrameIdx).addReg(Tmp1);
+          addFrameReference(BuildMI(BB, X86::FLD64m, 4, X86::FP0), FrameIdx);
+          BuildMI(BB, X86::FpSETRESULT, 1).addReg(X86::FP0);
+          ContainsFPCode = true; 
+        } else {
+          BuildMI(BB, X86::FpSETRESULT, 1).addReg(Tmp1);
+        }
+        break;
       case MVT::i32:
-	BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(Tmp1);
-	break;
+        BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(Tmp1);
+        break;
       }
       break;
     case 1:
@@ -4144,7 +4385,9 @@
     switch (StoredTy) {
     default: assert(0 && "Cannot truncstore this type!");
     case MVT::i1: Opc = X86::MOV8mr; break;
-    case MVT::f32: Opc = X86::FST32m; break;
+    case MVT::f32:
+      assert(!X86ScalarSSE && "Cannot truncstore scalar SSE regs"); 
+      Opc = X86::FST32m; break;
     }
 
     std::vector<std::pair<unsigned, unsigned> > RP;
@@ -4176,7 +4419,6 @@
       case MVT::i8:  Opc = X86::MOV8mi; break;
       case MVT::i16: Opc = X86::MOV16mi; break;
       case MVT::i32: Opc = X86::MOV32mi; break;
-      case MVT::f64: break;
       }
       if (Opc) {
         if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(2))) {
@@ -4215,7 +4457,8 @@
     case MVT::i8:  Opc = X86::MOV8mr; break;
     case MVT::i16: Opc = X86::MOV16mr; break;
     case MVT::i32: Opc = X86::MOV32mr; break;
-    case MVT::f64: Opc = X86::FST64m; break;
+    case MVT::f32: Opc = X86::MOVSSmr; break;
+    case MVT::f64: Opc = X86ScalarSSE ? X86::MOVSDmr : X86::FST64m; break;
     }
 
     std::vector<std::pair<unsigned, unsigned> > RP;

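To sketch the mask-based select emitted above (register assignment
illustrative: a in %xmm0, b in %xmm1, TrueVal in %xmm2, FalseVal in %xmm3,
selecting on a < b for doubles):

        cmpltsd %xmm1, %xmm0    # %xmm0 = (a < b) ? all-ones : all-zeros
        andpd   %xmm0, %xmm2    # TrueVal & mask
        andnpd  %xmm3, %xmm0    # ~mask & FalseVal
        orpd    %xmm0, %xmm2    # (TrueVal & mask) | (FalseVal & ~mask)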

Index: llvm/lib/Target/X86/X86InstrInfo.cpp
diff -u llvm/lib/Target/X86/X86InstrInfo.cpp:1.38 llvm/lib/Target/X86/X86InstrInfo.cpp:1.39
--- llvm/lib/Target/X86/X86InstrInfo.cpp:1.38	Thu Apr 21 18:38:14 2005
+++ llvm/lib/Target/X86/X86InstrInfo.cpp	Wed Jul  6 13:59:04 2005
@@ -28,7 +28,7 @@
                                unsigned& destReg) const {
   MachineOpCode oc = MI.getOpcode();
   if (oc == X86::MOV8rr || oc == X86::MOV16rr || oc == X86::MOV32rr ||
-      oc == X86::FpMOV) {
+      oc == X86::FpMOV  || oc == X86::MOVAPDrr) {
       assert(MI.getNumOperands() == 2 &&
              MI.getOperand(0).isRegister() &&
              MI.getOperand(1).isRegister() &&


Index: llvm/lib/Target/X86/X86InstrInfo.h
diff -u llvm/lib/Target/X86/X86InstrInfo.h:1.46 llvm/lib/Target/X86/X86InstrInfo.h:1.47
--- llvm/lib/Target/X86/X86InstrInfo.h:1.46	Thu Apr 21 18:38:14 2005
+++ llvm/lib/Target/X86/X86InstrInfo.h	Wed Jul  6 13:59:04 2005
@@ -107,6 +107,10 @@
     DA = 5 << Op0Shift,   DB = 6 << Op0Shift,
     DC = 7 << Op0Shift,   DD = 8 << Op0Shift,
     DE = 9 << Op0Shift,   DF = 10 << Op0Shift,
+    
+    // XS, XD - These prefix codes are for single and double precision scalar
+    // floating point operations performed in the SSE registers.
+    XD = 11 << Op0Shift,   XS = 12 << Op0Shift,
 
     //===------------------------------------------------------------------===//
     // This two-bit field describes the size of an immediate operand.  Zero is


Index: llvm/lib/Target/X86/X86InstrInfo.td
diff -u llvm/lib/Target/X86/X86InstrInfo.td:1.128 llvm/lib/Target/X86/X86InstrInfo.td:1.129
--- llvm/lib/Target/X86/X86InstrInfo.td:1.128	Mon Jun 27 16:20:31 2005
+++ llvm/lib/Target/X86/X86InstrInfo.td	Wed Jul  6 13:59:04 2005
@@ -187,7 +187,8 @@
 //
 let isCall = 1 in
   // All calls clobber the non-callee saved registers...
-  let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0] in {
+  let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+              XMM0, XMM1, XMM2, XMM3] in {
     def CALLpcrel32 : I<0xE8, RawFrm, (ops calltarget:$dst), "call $dst">;
     def CALL32r     : I<0xFF, MRM2r, (ops R32:$dst), "call {*}$dst">;
     def CALL32m     : I<0xFF, MRM2m, (ops i32mem:$dst), "call {*}$dst">;
@@ -1436,6 +1437,23 @@
                 "cvtss2sd {$src, $dst|$dst, $src}">, XD;
 def CVTSS2SDrm: I<0x5A, MRMSrcMem, (ops R32:$dst, f32mem:$src),
                 "cvtss2sd {$src, $dst|$dst, $src}">, XD;
+def CVTSI2SSrr: I<0x2A, MRMSrcReg, (ops R32:$dst, RXMM:$src),
+                "cvtsi2ss {$src, $dst|$dst, $src}">, XS;
+def CVTSI2SSrm: I<0x2A, MRMSrcMem, (ops R32:$dst, f32mem:$src),
+                "cvtsi2ss {$src, $dst|$dst, $src}">, XS;
+def CVTSI2SDrr: I<0x2A, MRMSrcReg, (ops R32:$dst, RXMM:$src),
+                "cvtsi2sd {$src, $dst|$dst, $src}">, XD;
+def CVTSI2SDrm: I<0x2A, MRMSrcMem, (ops R32:$dst, f64mem:$src),
+                "cvtsi2sd {$src, $dst|$dst, $src}">, XD;
+
+def SQRTSSrm : I<0x51, MRMSrcMem, (ops RXMM:$dst, f32mem:$src),
+                "sqrtss {$src, $dst|$dst, $src}">, XS;
+def SQRTSSrr : I<0x51, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "sqrtss {$src, $dst|$dst, $src}">, XS;
+def SQRTSDrm : I<0x51, MRMSrcMem, (ops RXMM:$dst, f64mem:$src),
+                "sqrtsd {$src, $dst|$dst, $src}">, XD;
+def SQRTSDrr : I<0x51, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
+                "sqrtsd {$src, $dst|$dst, $src}">, XD;
 
 def UCOMISDrr: I<0x2E, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
                 "ucomisd {$src, $dst|$dst, $src}">, TB, OpSize;

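For context, a scalar SSE arithmetic def pairs its 0x0F-escaped opcode with
the XS or XD prefix class; the pre-existing ADDSDrr def (not part of this
diff) looks roughly like:

def ADDSDrr : I<0x58, MRMSrcReg, (ops RXMM:$dst, RXMM:$src),
                "addsd {$src, $dst|$dst, $src}">, XD;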

Index: llvm/lib/Target/X86/X86RegisterInfo.cpp
diff -u llvm/lib/Target/X86/X86RegisterInfo.cpp:1.105 llvm/lib/Target/X86/X86RegisterInfo.cpp:1.106
--- llvm/lib/Target/X86/X86RegisterInfo.cpp:1.105	Sun May 15 00:49:58 2005
+++ llvm/lib/Target/X86/X86RegisterInfo.cpp	Wed Jul  6 13:59:04 2005
@@ -52,6 +52,7 @@
   case 32: return 2;
   case 64: return 3;   // FP in 64-bit spill mode.
   case 80: return 4;   // FP in 80-bit spill mode.
+  case 128: return 5;  // XMM reg in 128 bit mode.
   }
 }
 
@@ -59,18 +60,24 @@
                                           MachineBasicBlock::iterator MI,
                                           unsigned SrcReg, int FrameIdx) const {
   static const unsigned Opcode[] =
-    { X86::MOV8mr, X86::MOV16mr, X86::MOV32mr, X86::FST64m, X86::FSTP80m };
+    { X86::MOV8mr, X86::MOV16mr, X86::MOV32mr, X86::FST64m, X86::FSTP80m,
+      X86::MOVAPDmr };
   unsigned Idx = getIdx(getSpillSize(SrcReg));
-  addFrameReference(BuildMI(MBB, MI, Opcode[Idx], 5), FrameIdx).addReg(SrcReg);
+  unsigned Opc = Opcode[Idx];
+  if (X86ScalarSSE && Opc == X86::FST64m) Opc = X86::MOVSDmr;
+  addFrameReference(BuildMI(MBB, MI, Opc, 5), FrameIdx).addReg(SrcReg);
 }
 
 void X86RegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator MI,
                                            unsigned DestReg, int FrameIdx)const{
   static const unsigned Opcode[] =
-    { X86::MOV8rm, X86::MOV16rm, X86::MOV32rm, X86::FLD64m, X86::FLD80m };
+    { X86::MOV8rm, X86::MOV16rm, X86::MOV32rm, X86::FLD64m, X86::FLD80m,
+      X86::MOVAPDrm };
   unsigned Idx = getIdx(getSpillSize(DestReg));
-  addFrameReference(BuildMI(MBB, MI, Opcode[Idx], 4, DestReg), FrameIdx);
+  unsigned Opc = Opcode[Idx];
+  if (X86ScalarSSE && Opc == X86::FLD64m) Opc = X86::MOVSDrm;
+  addFrameReference(BuildMI(MBB, MI, Opc, 4, DestReg), FrameIdx);
 }
 
 void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
@@ -78,8 +85,11 @@
                                    unsigned DestReg, unsigned SrcReg,
                                    const TargetRegisterClass *RC) const {
   static const unsigned Opcode[] =
-    { X86::MOV8rr, X86::MOV16rr, X86::MOV32rr, X86::FpMOV, X86::FpMOV };
-  BuildMI(MBB, MI, Opcode[getIdx(RC->getSize()*8)], 1, DestReg).addReg(SrcReg);
+    { X86::MOV8rr, X86::MOV16rr, X86::MOV32rr, X86::FpMOV, X86::FpMOV,
+      X86::MOVAPDrr };
+  unsigned Opc = Opcode[getIdx(RC->getSize()*8)];
+  if (X86ScalarSSE && Opc == X86::FpMOV) Opc = X86::MOVAPDrr;
+  BuildMI(MBB, MI, Opc, 1, DestReg).addReg(SrcReg);
 }
 
 static MachineInstr *MakeMInst(unsigned Opcode, unsigned FrameIndex,

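For reference, the spill-size index mapping is now: 8 bits -> 0 (MOV8mr/rm),
16 -> 1, 32 -> 2, 64 -> 3 (FST64m/FLD64m, or MOVSDmr/MOVSDrm under scalar
SSE), 80 -> 4 (FSTP80m/FLD80m), and 128 -> 5 (MOVAPDmr/MOVAPDrm).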

Index: llvm/lib/Target/X86/X86RegisterInfo.td
diff -u llvm/lib/Target/X86/X86RegisterInfo.td:1.18 llvm/lib/Target/X86/X86RegisterInfo.td:1.19
--- llvm/lib/Target/X86/X86RegisterInfo.td:1.18	Mon Jun 27 16:20:31 2005
+++ llvm/lib/Target/X86/X86RegisterInfo.td	Wed Jul  6 13:59:04 2005
@@ -99,8 +99,8 @@
 // FIXME: These registers can contain both integer and fp values.  We should
 // figure out the right way to deal with that.  For now, since they'll be used
 // for scalar FP, they are being declared f64
-def RXMM : RegisterClass<f64, 128, [XMM0, XMM1, XMM2, XMM3, 
-                                    XMM4, XMM5, XMM6, XMM7]>;
+def RXMM : RegisterClass<f64, 32, [XMM0, XMM1, XMM2, XMM3, 
+                                   XMM4, XMM5, XMM6, XMM7]>;
 
 // FIXME: This sets up the floating point register files as though they are f64
 // values, though they really are f80 values.  This will cause us to spill


Index: llvm/lib/Target/X86/X86TargetMachine.cpp
diff -u llvm/lib/Target/X86/X86TargetMachine.cpp:1.80 llvm/lib/Target/X86/X86TargetMachine.cpp:1.81
--- llvm/lib/Target/X86/X86TargetMachine.cpp:1.80	Mon Jun 27 01:30:12 2005
+++ llvm/lib/Target/X86/X86TargetMachine.cpp	Wed Jul  6 13:59:04 2005
@@ -26,6 +26,7 @@
 using namespace llvm;
 
 X86VectorEnum llvm::X86Vector = NoSSE;
+bool llvm::X86ScalarSSE = false;
 
 /// X86TargetMachineModule - Note that this is used on hosts that cannot link
 /// in a library unless there are references into the library.  In particular,
@@ -41,8 +42,11 @@
   cl::opt<bool> DisableOutput("disable-x86-llc-output", cl::Hidden,
                               cl::desc("Disable the X86 asm printer, for use "
                                        "when profiling the code generator."));
+  cl::opt<bool, true> EnableSSEFP("enable-sse-scalar-fp",
+                cl::desc("Perform FP math in SSE regs instead of the FP stack"),
+                cl::location(X86ScalarSSE),
+                cl::init(false));
 
-#if 0
   // FIXME: This should eventually be handled with target triples and
   // subtarget support!
   cl::opt<X86VectorEnum, true>
@@ -54,7 +58,6 @@
        clEnumValN(SSE3, "sse3", "  Enable SSE, SSE2, and SSE3 support"),
        clEnumValEnd),
     cl::location(X86Vector), cl::init(NoSSE));
-#endif
 
   // Register the target.
   RegisterTarget<X86TargetMachine> X("x86", "  IA-32 (Pentium and above)");
@@ -91,6 +94,8 @@
   : TargetMachine("X86", IL, true, 4, 4, 4, 4, 4),
     FrameInfo(TargetFrameInfo::StackGrowsDown, 8, -4),
     JITInfo(*this) {
+  // Scalar SSE FP requires at least SSE2
+  X86ScalarSSE &= X86Vector >= SSE2;
 }
 
 