[llvm-commits] CVS: llvm/lib/Target/X86/InstSelectSimple.cpp
Chris Lattner
lattner at cs.uiuc.edu
Wed Feb 25 00:14:03 PST 2004
Changes in directory llvm/lib/Target/X86:
InstSelectSimple.cpp updated: 1.176 -> 1.177
---
Log message:
* Make the previous patch more efficient by not allocating a temporary MachineInstr
to do analysis.
*** FOLD getelementptr instructions into loads and stores when possible,
making use of some of the crazy X86 addressing modes.
For example, the following C++ program fragment:
struct complex {
  double re, im;
  complex(double r, double i) : re(r), im(i) {}
};

inline complex operator+(const complex& a, const complex& b) {
  return complex(a.re+b.re, a.im+b.im);
}

complex addone(const complex& arg) {
  return arg + complex(1,0);
}
Used to be compiled to:
_Z6addoneRK7complex:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %ECX, DWORD PTR [%ESP + 8]
***     mov %EDX, %ECX
        fld QWORD PTR [%EDX]
        fld1
        faddp %ST(1)
***     add %ECX, 8
        fld QWORD PTR [%ECX]
        fldz
        faddp %ST(1)
***     mov %ECX, %EAX
        fxch %ST(1)
        fstp QWORD PTR [%ECX]
***     add %EAX, 8
        fstp QWORD PTR [%EAX]
        ret
Now it is compiled to:
_Z6addoneRK7complex:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %ECX, DWORD PTR [%ESP + 8]
        fld QWORD PTR [%ECX]
        fld1
        faddp %ST(1)
        fld QWORD PTR [%ECX + 8]
        fldz
        faddp %ST(1)
        fxch %ST(1)
        fstp QWORD PTR [%EAX]
        fstp QWORD PTR [%EAX + 8]
        ret
Other programs should see similar improvements across the board. Note that
in addition to reducing instruction count, this also reduces register pressure
a lot, always a good thing on X86. :)
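
As a quick orientation for the diff below: the new getGEPIndex/isGEPFoldable
interface boils a getelementptr down to the four components of the x86
addressing form Base + Index*Scale + Disp, which addFullAddress then attaches
to the load, store, or lea. A minimal sketch of the idea, for illustration
only (the struct is hypothetical; the patch actually passes the four values
as separate 'unsigned &' out-parameters):

  struct X86AddressMode {   // hypothetical grouping, not in the patch
    unsigned BaseReg;       // register holding the base pointer (0 = none)
    unsigned Scale;         // element scale: 1, 2, 4, or 8
    unsigned IndexReg;      // register holding a variable index (0 = none)
    unsigned Disp;          // constant byte displacement
  };

  // For the 'arg.im' access in the example above, getGEPIndex folds the GEP
  // to BaseReg = <reg holding arg>, Scale = 1, IndexReg = 0, Disp = 8, so
  // the load is selected as the single instruction:
  //   fld QWORD PTR [%ECX + 8]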
---
Diffs of the changes: (+184 -56)
Index: llvm/lib/Target/X86/InstSelectSimple.cpp
diff -u llvm/lib/Target/X86/InstSelectSimple.cpp:1.176 llvm/lib/Target/X86/InstSelectSimple.cpp:1.177
--- llvm/lib/Target/X86/InstSelectSimple.cpp:1.176 Tue Feb 24 21:45:50 2004
+++ llvm/lib/Target/X86/InstSelectSimple.cpp Wed Feb 25 00:13:04 2004
@@ -222,6 +222,20 @@
///
void promote32(unsigned targetReg, const ValueRecord &VR);
+ // getGEPIndex - This is used to fold GEP instructions into X86 addressing
+ // expressions.
+ void getGEPIndex(MachineBasicBlock *MBB, MachineBasicBlock::iterator IP,
+ std::vector<Value*> &GEPOps,
+ std::vector<const Type*> &GEPTypes, unsigned &BaseReg,
+ unsigned &Scale, unsigned &IndexReg, unsigned &Disp);
+
+ /// isGEPFoldable - Return true if the specified GEP can be completely
+ /// folded into the addressing mode of a load/store or lea instruction.
+ bool isGEPFoldable(MachineBasicBlock *MBB,
+ Value *Src, User::op_iterator IdxBegin,
+ User::op_iterator IdxEnd, unsigned &BaseReg,
+ unsigned &Scale, unsigned &IndexReg, unsigned &Disp);
+
/// emitGEPOperation - Common code shared between visitGetElementPtrInst and
/// constant expression GEP support.
///
@@ -1884,14 +1898,32 @@
/// need to worry about the memory layout of the target machine.
///
void ISel::visitLoadInst(LoadInst &I) {
- unsigned SrcAddrReg = getReg(I.getOperand(0));
unsigned DestReg = getReg(I);
+ unsigned BaseReg = 0, Scale = 1, IndexReg = 0, Disp = 0;
+ Value *Addr = I.getOperand(0);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr)) {
+ if (isGEPFoldable(BB, GEP->getOperand(0), GEP->op_begin()+1, GEP->op_end(),
+ BaseReg, Scale, IndexReg, Disp))
+ Addr = 0; // Address is consumed!
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr)
+ if (isGEPFoldable(BB, CE->getOperand(0), CE->op_begin()+1, CE->op_end(),
+ BaseReg, Scale, IndexReg, Disp))
+ Addr = 0;
+ }
+
+ if (Addr) {
+ // If it's not foldable, reset addr mode.
+ BaseReg = getReg(Addr);
+ Scale = 1; IndexReg = 0; Disp = 0;
+ }
unsigned Class = getClassB(I.getType());
-
if (Class == cLong) {
- addDirectMem(BuildMI(BB, X86::MOVrm32, 4, DestReg), SrcAddrReg);
- addRegOffset(BuildMI(BB, X86::MOVrm32, 4, DestReg+1), SrcAddrReg, 4);
+ addFullAddress(BuildMI(BB, X86::MOVrm32, 4, DestReg),
+ BaseReg, Scale, IndexReg, Disp);
+ addFullAddress(BuildMI(BB, X86::MOVrm32, 4, DestReg+1),
+ BaseReg, Scale, IndexReg, Disp+4);
return;
}
@@ -1900,37 +1932,61 @@
};
unsigned Opcode = Opcodes[Class];
if (I.getType() == Type::DoubleTy) Opcode = X86::FLDr64;
- addDirectMem(BuildMI(BB, Opcode, 4, DestReg), SrcAddrReg);
+ addFullAddress(BuildMI(BB, Opcode, 4, DestReg),
+ BaseReg, Scale, IndexReg, Disp);
}
/// visitStoreInst - Implement LLVM store instructions in terms of the x86 'mov'
/// instruction.
///
void ISel::visitStoreInst(StoreInst &I) {
- unsigned AddressReg = getReg(I.getOperand(1));
+ unsigned BaseReg = 0, Scale = 1, IndexReg = 0, Disp = 0;
+ Value *Addr = I.getOperand(1);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr)) {
+ if (isGEPFoldable(BB, GEP->getOperand(0), GEP->op_begin()+1, GEP->op_end(),
+ BaseReg, Scale, IndexReg, Disp))
+ Addr = 0; // Address is consumed!
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr)
+ if (isGEPFoldable(BB, CE->getOperand(0), CE->op_begin()+1, CE->op_end(),
+ BaseReg, Scale, IndexReg, Disp))
+ Addr = 0;
+ }
+
+ if (Addr) {
+ // If it's not foldable, reset addr mode.
+ BaseReg = getReg(Addr);
+ Scale = 1; IndexReg = 0; Disp = 0;
+ }
+
const Type *ValTy = I.getOperand(0)->getType();
unsigned Class = getClassB(ValTy);
if (ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(0))) {
uint64_t Val = CI->getRawValue();
if (Class == cLong) {
- addDirectMem(BuildMI(BB, X86::MOVmi32, 5), AddressReg).addZImm(Val & ~0U);
- addRegOffset(BuildMI(BB, X86::MOVmi32, 5), AddressReg,4).addZImm(Val>>32);
+ addFullAddress(BuildMI(BB, X86::MOVmi32, 5),
+ BaseReg, Scale, IndexReg, Disp).addZImm(Val & ~0U);
+ addFullAddress(BuildMI(BB, X86::MOVmi32, 5),
+ BaseReg, Scale, IndexReg, Disp+4).addZImm(Val>>32);
} else {
static const unsigned Opcodes[] = {
X86::MOVmi8, X86::MOVmi16, X86::MOVmi32
};
unsigned Opcode = Opcodes[Class];
- addDirectMem(BuildMI(BB, Opcode, 5), AddressReg).addZImm(Val);
+ addFullAddress(BuildMI(BB, Opcode, 5),
+ BaseReg, Scale, IndexReg, Disp).addZImm(Val);
}
} else if (ConstantBool *CB = dyn_cast<ConstantBool>(I.getOperand(0))) {
- addDirectMem(BuildMI(BB, X86::MOVmi8, 5),
- AddressReg).addZImm(CB->getValue());
+ addFullAddress(BuildMI(BB, X86::MOVmi8, 5),
+ BaseReg, Scale, IndexReg, Disp).addZImm(CB->getValue());
} else {
if (Class == cLong) {
unsigned ValReg = getReg(I.getOperand(0));
- addDirectMem(BuildMI(BB, X86::MOVmr32, 5), AddressReg).addReg(ValReg);
- addRegOffset(BuildMI(BB, X86::MOVmr32, 5), AddressReg,4).addReg(ValReg+1);
+ addFullAddress(BuildMI(BB, X86::MOVmr32, 5),
+ BaseReg, Scale, IndexReg, Disp).addReg(ValReg);
+ addFullAddress(BuildMI(BB, X86::MOVmr32, 5),
+ BaseReg, Scale, IndexReg, Disp+4).addReg(ValReg+1);
} else {
unsigned ValReg = getReg(I.getOperand(0));
static const unsigned Opcodes[] = {
@@ -1938,7 +1994,8 @@
};
unsigned Opcode = Opcodes[Class];
if (ValTy == Type::DoubleTy) Opcode = X86::FSTr64;
- addDirectMem(BuildMI(BB, Opcode, 1+4), AddressReg).addReg(ValReg);
+ addFullAddress(BuildMI(BB, Opcode, 1+4),
+ BaseReg, Scale, IndexReg, Disp).addReg(ValReg);
}
}
}
@@ -2138,7 +2195,8 @@
}
// Spill the integer to memory and reload it from there...
- int FrameIdx = F->getFrameInfo()->CreateStackObject(SrcTy, TM.getTargetData());
+ int FrameIdx =
+ F->getFrameInfo()->CreateStackObject(SrcTy, TM.getTargetData());
if (SrcClass == cLong) {
addFrameReference(BMI(BB, IP, X86::MOVmr32, 5), FrameIdx).addReg(SrcReg);
@@ -2160,15 +2218,18 @@
// Emit a test instruction to see if the dynamic input value was signed.
BMI(BB, IP, X86::TESTrr32, 2).addReg(SrcReg+1).addReg(SrcReg+1);
- // If the sign bit is set, get a pointer to an offset, otherwise get a pointer to a zero.
+ // If the sign bit is set, get a pointer to an offset, otherwise get a
+ // pointer to a zero.
MachineConstantPool *CP = F->getConstantPool();
unsigned Zero = makeAnotherReg(Type::IntTy);
+ Constant *Null = Constant::getNullValue(Type::UIntTy);
addConstantPoolReference(BMI(BB, IP, X86::LEAr32, 5, Zero),
- CP->getConstantPoolIndex(Constant::getNullValue(Type::UIntTy)));
+ CP->getConstantPoolIndex(Null));
unsigned Offset = makeAnotherReg(Type::IntTy);
+ Constant *OffsetCst = ConstantUInt::get(Type::UIntTy, 0x5f800000);
+
addConstantPoolReference(BMI(BB, IP, X86::LEAr32, 5, Offset),
- CP->getConstantPoolIndex(ConstantUInt::get(Type::UIntTy,
- 0x5f800000)));
+ CP->getConstantPoolIndex(OffsetCst));
unsigned Addr = makeAnotherReg(Type::IntTy);
BMI(BB, IP, X86::CMOVSrr32, 2, Addr).addReg(Zero).addReg(Offset);
@@ -2303,6 +2364,26 @@
void ISel::visitGetElementPtrInst(GetElementPtrInst &I) {
+ // If this GEP instruction will be folded into all of its users, we don't need
+ // to explicitly calculate it!
+ unsigned A, B, C, D;
+ if (isGEPFoldable(0, I.getOperand(0), I.op_begin()+1, I.op_end(), A,B,C,D)) {
+ // Check all of the users of the instruction to see if they are loads and
+ // stores.
+ bool AllWillFold = true;
+ for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E; ++UI)
+ if (cast<Instruction>(*UI)->getOpcode() != Instruction::Load)
+ if (cast<Instruction>(*UI)->getOpcode() != Instruction::Store ||
+ cast<Instruction>(*UI)->getOperand(0) == &I) {
+ AllWillFold = false;
+ break;
+ }
+
+ // If the instruction is foldable, and will be folded into all users, don't
+ // emit it!
+ if (AllWillFold) return;
+ }
+
unsigned outputReg = getReg(I);
emitGEPOperation(BB, BB->end(), I.getOperand(0),
I.op_begin()+1, I.op_end(), outputReg);
@@ -2319,15 +2400,18 @@
///
/// Note that there is one fewer entry in GEPTypes than there is in GEPOps.
///
-static void getGEPIndex(std::vector<Value*> &GEPOps,
- std::vector<const Type*> &GEPTypes,
- MachineInstr *Ops, const TargetData &TD){
+void ISel::getGEPIndex(MachineBasicBlock *MBB, MachineBasicBlock::iterator IP,
+ std::vector<Value*> &GEPOps,
+ std::vector<const Type*> &GEPTypes, unsigned &BaseReg,
+ unsigned &Scale, unsigned &IndexReg, unsigned &Disp) {
+ const TargetData &TD = TM.getTargetData();
+
// Clear out the state we are working with...
- Ops->getOperand(0).setReg(0); // No base register
- Ops->getOperand(1).setImmedValue(1); // Unit scale
- Ops->getOperand(2).setReg(0); // No index register
- Ops->getOperand(3).setImmedValue(0); // No displacement
-
+ BaseReg = 0; // No base register
+ Scale = 1; // Unit scale
+ IndexReg = 0; // No index register
+ Disp = 0; // No displacement
+
// While there are GEP indexes that can be folded into the current address,
// keep processing them.
while (!GEPTypes.empty()) {
@@ -2340,14 +2424,7 @@
// structure is in memory. Since the structure index must be constant, we
// can get its value and use it to find the right byte offset from the
// StructLayout class's list of structure member offsets.
- unsigned idxValue = CUI->getValue();
- unsigned FieldOff = TD.getStructLayout(StTy)->MemberOffsets[idxValue];
- if (FieldOff) {
- if (Ops->getOperand(2).getReg())
- return; // Already has an index, can't add offset.
- Ops->getOperand(3).setImmedValue(FieldOff+
- Ops->getOperand(3).getImmedValue());
- }
+ Disp += TD.getStructLayout(StTy)->MemberOffsets[CUI->getValue()];
GEPOps.pop_back(); // Consume a GEP operand
GEPTypes.pop_back();
} else {
@@ -2362,10 +2439,7 @@
// If idx is a constant, fold it into the offset.
if (ConstantSInt *CSI = dyn_cast<ConstantSInt>(idx)) {
- unsigned elementSize = TD.getTypeSize(SqTy->getElementType());
- unsigned Offset = elementSize*CSI->getValue();
- Ops->getOperand(3).setImmedValue(Offset+
- Ops->getOperand(3).getImmedValue());
+ Disp += TD.getTypeSize(SqTy->getElementType())*CSI->getValue();
} else {
// If we can't handle it, return.
return;
@@ -2375,15 +2449,49 @@
GEPTypes.pop_back();
}
}
+
+ // GEPTypes is empty, which means we have a single operand left. See if we
+ // can set it as the base register.
+ //
+ // FIXME: When addressing modes are more powerful/correct, we could load
+ // global addresses directly as 32-bit immediates.
+ assert(BaseReg == 0);
+ BaseReg = MBB ? getReg(GEPOps[0], MBB, IP) : 0;
+ GEPOps.pop_back(); // Consume the last GEP operand
}
+/// isGEPFoldable - Return true if the specified GEP can be completely
+/// folded into the addressing mode of a load/store or lea instruction.
+bool ISel::isGEPFoldable(MachineBasicBlock *MBB,
+ Value *Src, User::op_iterator IdxBegin,
+ User::op_iterator IdxEnd, unsigned &BaseReg,
+ unsigned &Scale, unsigned &IndexReg, unsigned &Disp) {
+ if (ConstantPointerRef *CPR = dyn_cast<ConstantPointerRef>(Src))
+ Src = CPR->getValue();
+
+ std::vector<Value*> GEPOps;
+ GEPOps.resize(IdxEnd-IdxBegin+1);
+ GEPOps[0] = Src;
+ std::copy(IdxBegin, IdxEnd, GEPOps.begin()+1);
+
+ std::vector<const Type*> GEPTypes;
+ GEPTypes.assign(gep_type_begin(Src->getType(), IdxBegin, IdxEnd),
+ gep_type_end(Src->getType(), IdxBegin, IdxEnd));
+
+ MachineBasicBlock::iterator IP;
+ if (MBB) IP = MBB->end();
+ getGEPIndex(MBB, IP, GEPOps, GEPTypes, BaseReg, Scale, IndexReg, Disp);
+
+ // We can fold it away iff the getGEPIndex call eliminated all operands.
+ return GEPOps.empty();
+}
+
void ISel::emitGEPOperation(MachineBasicBlock *MBB,
MachineBasicBlock::iterator IP,
Value *Src, User::op_iterator IdxBegin,
User::op_iterator IdxEnd, unsigned TargetReg) {
const TargetData &TD = TM.getTargetData();
-
if (ConstantPointerRef *CPR = dyn_cast<ConstantPointerRef>(Src))
Src = CPR->getValue();
@@ -2396,27 +2504,28 @@
GEPTypes.assign(gep_type_begin(Src->getType(), IdxBegin, IdxEnd),
gep_type_end(Src->getType(), IdxBegin, IdxEnd));
- // DummyMI - A dummy instruction to pass into getGEPIndex. The opcode doesn't
- // matter, we just need 4 MachineOperands.
- MachineInstr *DummyMI =
- BuildMI(X86::PHI, 4).addReg(0).addZImm(1).addReg(0).addSImm(0);
-
// Keep emitting instructions until we consume the entire GEP instruction.
while (!GEPOps.empty()) {
unsigned OldSize = GEPOps.size();
- getGEPIndex(GEPOps, GEPTypes, DummyMI, TD);
+ unsigned BaseReg, Scale, IndexReg, Disp;
+ getGEPIndex(MBB, IP, GEPOps, GEPTypes, BaseReg, Scale, IndexReg, Disp);
if (GEPOps.size() != OldSize) {
// getGEPIndex consumed some of the input. Build an LEA instruction here.
- assert(DummyMI->getOperand(0).getReg() == 0 &&
- DummyMI->getOperand(1).getImmedValue() == 1 &&
- DummyMI->getOperand(2).getReg() == 0 &&
- "Unhandled GEP fold!");
- if (unsigned Offset = DummyMI->getOperand(3).getImmedValue()) {
- unsigned Reg = makeAnotherReg(Type::UIntTy);
- addRegOffset(BMI(MBB, IP, X86::LEAr32, 5, TargetReg), Reg, Offset);
- TargetReg = Reg;
+ unsigned NextTarget = 0;
+ if (!GEPOps.empty()) {
+ assert(BaseReg == 0 &&
+ "getGEPIndex should have left the base register open for chaining!");
+ NextTarget = BaseReg = makeAnotherReg(Type::UIntTy);
}
+
+ if (IndexReg == 0 && Disp == 0)
+ BMI(MBB, IP, X86::MOVrr32, 1, TargetReg).addReg(BaseReg);
+ else
+ addFullAddress(BMI(MBB, IP, X86::LEAr32, 5, TargetReg),
+ BaseReg, Scale, IndexReg, Disp);
+ --IP;
+ TargetReg = NextTarget;
} else if (GEPTypes.empty()) {
// The getGEPIndex operation didn't want to build an LEA. Check to see if
// all operands are consumed but the base pointer. If so, just load it
@@ -2428,6 +2537,27 @@
BMI(MBB, IP, X86::MOVrr32, 1, TargetReg).addReg(BaseReg);
}
break; // we are now done
+
+ } else if (const StructType *StTy = dyn_cast<StructType>(GEPTypes.back())) {
+ // It's a struct access. CUI is the index into the structure,
+ // which names the field. This index must have unsigned type.
+ const ConstantUInt *CUI = cast<ConstantUInt>(GEPOps.back());
+ GEPOps.pop_back(); // Consume a GEP operand
+ GEPTypes.pop_back();
+
+ // Use the TargetData structure to pick out what the layout of the
+ // structure is in memory. Since the structure index must be constant, we
+ // can get its value and use it to find the right byte offset from the
+ // StructLayout class's list of structure member offsets.
+ unsigned idxValue = CUI->getValue();
+ unsigned FieldOff = TD.getStructLayout(StTy)->MemberOffsets[idxValue];
+ if (FieldOff) {
+ unsigned Reg = makeAnotherReg(Type::UIntTy);
+ // Emit an ADD to add FieldOff to the basePtr.
+ BMI(MBB, IP, X86::ADDri32, 2, TargetReg).addReg(Reg).addZImm(FieldOff);
+ --IP; // Insert the next instruction before this one.
+ TargetReg = Reg; // Codegen the rest of the GEP into this
+ }
} else {
// It's an array or pointer access: [ArraySize x ElementType].
const SequentialType *SqTy = cast<SequentialType>(GEPTypes.back());
@@ -2496,8 +2626,6 @@
}
}
}
-
- delete DummyMI;
}