[llvm-commits] [llvm] r144602 - in /llvm/trunk: include/llvm/Target/TargetInstrInfo.h lib/CodeGen/ExecutionDepsFix.cpp lib/Target/X86/X86InstrInfo.cpp lib/Target/X86/X86InstrInfo.h test/CodeGen/X86/sse-domains.ll

Jakob Stoklund Olesen stoklund at 2pi.dk
Mon Nov 14 17:15:30 PST 2011


Author: stoklund
Date: Mon Nov 14 19:15:30 2011
New Revision: 144602

URL: http://llvm.org/viewvc/llvm-project?rev=144602&view=rev
Log:
Break false dependencies before partial register updates.

Two new TargetInstrInfo hooks let the target tell ExecutionDepsFix
about instructions with partial register updates causing false unwanted
dependencies.

The ExecutionDepsFix pass will break the false dependencies if the
updated register was written in the previous N instructions.

The small loop added to sse-domains.ll runs twice as fast with
dependency-breaking instructions inserted.

Modified:
    llvm/trunk/include/llvm/Target/TargetInstrInfo.h
    llvm/trunk/lib/CodeGen/ExecutionDepsFix.cpp
    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
    llvm/trunk/lib/Target/X86/X86InstrInfo.h
    llvm/trunk/test/CodeGen/X86/sse-domains.ll

Modified: llvm/trunk/include/llvm/Target/TargetInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetInstrInfo.h?rev=144602&r1=144601&r2=144602&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetInstrInfo.h (original)
+++ llvm/trunk/include/llvm/Target/TargetInstrInfo.h Mon Nov 14 19:15:30 2011
@@ -718,6 +718,74 @@
   ///
   virtual void setExecutionDomain(MachineInstr *MI, unsigned Domain) const {}
 
+
+  /// getPartialRegUpdateClearance - Returns the preferred minimum clearance
+  /// before an instruction with an unwanted partial register update.
+  ///
+  /// Some instructions only write part of a register, and implicitly need to
+  /// read the other parts of the register.  This may cause unwanted stalls
+  /// preventing otherwise unrelated instructions from executing in parallel in
+  /// an out-of-order CPU.
+  ///
+  /// For example, the x86 instruction cvtsi2ss writes its result to bits
+  /// [31:0] of the destination xmm register. Bits [127:32] are unaffected, so
+  /// the instruction needs to wait for the old value of the register to become
+  /// available:
+  ///
+  ///   addps %xmm1, %xmm0
+  ///   movaps %xmm0, (%rax)
+  ///   cvtsi2ss %rbx, %xmm0
+  ///
+  /// In the code above, the cvtsi2ss instruction needs to wait for the addps
+  /// instruction before it can issue, even though the high bits of %xmm0
+  /// probably aren't needed.
+  ///
+  /// This hook returns the preferred clearance before MI, measured in
+  /// instructions.  Other defs of MI's operand OpNum are avoided in the last N
+  /// instructions before MI.  It should only return a positive value for
+  /// unwanted dependencies.  If the old bits of the defined register have
+  /// useful values, or if MI is determined to otherwise read the dependency,
+  /// the hook should return 0.
+  ///
+  /// The unwanted dependency may be handled by:
+  ///
+  /// 1. Allocating the same register for an MI def and use.  That makes the
+  ///    unwanted dependency identical to a required dependency.
+  ///
+  /// 2. Allocating a register for the def that has no defs in the previous N
+  ///    instructions.
+  ///
+  /// 3. Calling breakPartialRegDependency() with the same arguments.  This
+  ///    allows the target to insert a dependency breaking instruction.
+  ///
+  virtual unsigned
+  getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+                               const TargetRegisterInfo *TRI) const {
+    // The default implementation returns 0 for no partial register dependency.
+    return 0;
+  }
+
+  /// breakPartialRegDependency - Insert a dependency-breaking instruction
+  /// before MI to eliminate an unwanted dependency on OpNum.
+  ///
+  /// If it wasn't possible to avoid a def in the last N instructions before MI
+  /// (see getPartialRegUpdateClearance), this hook will be called to break the
+  /// unwanted dependency.
+  ///
+  /// On x86, an xorps instruction can be used as a dependency breaker:
+  ///
+  ///   addps %xmm1, %xmm0
+  ///   movaps %xmm0, (%rax)
+  ///   xorps %xmm0, %xmm0
+  ///   cvtsi2ss %rbx, %xmm0
+  ///
+  /// An <imp-kill> operand should be added to MI if an instruction was
+  /// inserted.  This ties the instructions together in the post-ra scheduler.
+  ///
+  virtual void
+  breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+                            const TargetRegisterInfo *TRI) const {}
+
 private:
   int CallFrameSetupOpcode, CallFrameDestroyOpcode;
 };

Modified: llvm/trunk/lib/CodeGen/ExecutionDepsFix.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/ExecutionDepsFix.cpp?rev=144602&r1=144601&r2=144602&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/ExecutionDepsFix.cpp (original)
+++ llvm/trunk/lib/CodeGen/ExecutionDepsFix.cpp Mon Nov 14 19:15:30 2011
@@ -471,11 +471,34 @@
     DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
                  << '\t' << *MI);
 
+    // How many instructions since rx was last written?
+    unsigned Clearance = CurInstr - LiveRegs[rx].Def;
     LiveRegs[rx].Def = CurInstr;
 
     // Kill off domains redefined by generic instructions.
     if (Kill)
       kill(rx);
+
+    // Verify clearance before partial register updates.
+    unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
+    if (!Pref)
+      continue;
+    DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
+    if (Pref > Clearance) {
+      DEBUG(dbgs() << ": Break dependency.\n");
+      TII->breakPartialRegDependency(MI, i, TRI);
+      continue;
+    }
+
+    // The current clearance seems OK, but we may be ignoring a def from a
+    // back-edge.
+    if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
+      DEBUG(dbgs() << ": OK.\n");
+      continue;
+    }
+
+    // A def from an unprocessed back-edge may make us break this dependency.
+    DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
   }
 
   ++CurInstr;
@@ -663,6 +686,10 @@
   for (unsigned i = 0, e = Loops.size(); i != e; ++i) {
     MachineBasicBlock *MBB = Loops[i];
     enterBasicBlock(MBB);
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+        ++I)
+      if (!I->isDebugValue())
+        processDefs(I, false);
     leaveBasicBlock(MBB);
   }
 

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=144602&r1=144601&r2=144602&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Mon Nov 14 19:15:30 2011
@@ -2761,6 +2761,10 @@
 ///
 static bool hasPartialRegUpdate(unsigned Opcode) {
   switch (Opcode) {
+  case X86::CVTSI2SSrr:
+  case X86::CVTSI2SS64rr:
+  case X86::CVTSI2SDrr:
+  case X86::CVTSI2SD64rr:
   case X86::CVTSD2SSrr:
   case X86::Int_CVTSD2SSrr:
   case X86::CVTSS2SDrr:
@@ -2789,6 +2793,54 @@
   return false;
 }
 
+/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle
+/// instructions we would like before a partial register update.
+unsigned X86InstrInfo::
+getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+                             const TargetRegisterInfo *TRI) const {
+  if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode()))
+    return 0;
+
+  // If MI is marked as reading Reg, the partial register update is wanted.
+  const MachineOperand &MO = MI->getOperand(0);
+  unsigned Reg = MO.getReg();
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    if (MO.readsReg() || MI->readsVirtualRegister(Reg))
+      return 0;
+  } else {
+    if (MI->readsRegister(Reg, TRI))
+      return 0;
+  }
+
+  // If any of the preceding 16 instructions are reading Reg, insert a
+  // dependency breaking instruction.  The magic number is based on a few
+  // Nehalem experiments.
+  return 16;
+}
+
+void X86InstrInfo::
+breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+                          const TargetRegisterInfo *TRI) const {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  if (X86::VR128RegClass.contains(Reg)) {
+    // These instructions are all floating point domain, so xorps is the best
+    // choice.
+    bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+    unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr;
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg)
+      .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+  } else if (X86::VR256RegClass.contains(Reg)) {
+    // Use vxorps to clear the full ymm register.
+    // It wants to read and write the xmm sub-register.
+    unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg)
+      .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef)
+      .addReg(Reg, RegState::ImplicitDefine);
+  } else
+    return;
+  MI->addRegisterKilled(Reg, TRI, true);
+}
+
 MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                                   MachineInstr *MI,
                                            const SmallVectorImpl<unsigned> &Ops,

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.h?rev=144602&r1=144601&r2=144602&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.h (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.h Mon Nov 14 19:15:30 2011
@@ -345,6 +345,11 @@
 
   void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
 
+  unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+                                        const TargetRegisterInfo *TRI) const;
+  void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+                                 const TargetRegisterInfo *TRI) const;
+
   MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                       MachineInstr* MI,
                                       unsigned OpNum,

Modified: llvm/trunk/test/CodeGen/X86/sse-domains.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-domains.ll?rev=144602&r1=144601&r2=144602&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-domains.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse-domains.ll Mon Nov 14 19:15:30 2011
@@ -43,3 +43,44 @@
 while.end:
   ret void
 }
+
+; CHECK: f2
+;
+; This loop contains two cvtsi2ss instructions that update the same xmm
+; register.  Verify that the execution dependency fix pass breaks those
+; dependencies by inserting xorps instructions.
+;
+; If the register allocator chooses different registers for the two cvtsi2ss
+; instructions, they are still dependent on themselves.
+; CHECK: xorps [[XMM1:%xmm[0-9]+]]
+; CHECK: , [[XMM1]]
+; CHECK: cvtsi2ss %{{.*}}, [[XMM1]]
+; CHECK: xorps [[XMM2:%xmm[0-9]+]]
+; CHECK: , [[XMM2]]
+; CHECK: cvtsi2ss %{{.*}}, [[XMM2]]
+;
+define float @f2(i32 %m) nounwind uwtable readnone ssp {
+entry:
+  %tobool3 = icmp eq i32 %m, 0
+  br i1 %tobool3, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ]
+  %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
+  %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ]
+  %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ]
+  %conv = sitofp i32 %n.04 to float
+  %add = fadd float %s1.06, %conv
+  %conv1 = sitofp i32 %m.addr.07 to float
+  %add2 = fadd float %s2.05, %conv1
+  %inc = add nsw i32 %n.04, 1
+  %dec = add nsw i32 %m.addr.07, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ]
+  %sub = fsub float %s1.0.lcssa, %s2.0.lcssa
+  ret float %sub
+}





More information about the llvm-commits mailing list