[llvm-commits] [llvm] r117675 - in /llvm/trunk: lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp lib/Target/ARM/ARMBaseInstrInfo.cpp test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll test/CodeGen/Thumb2/cross-rc-coalescing-2.ll
Evan Cheng
evan.cheng at apple.com
Fri Oct 29 11:09:29 PDT 2010
Author: evancheng
Date: Fri Oct 29 13:09:28 2010
New Revision: 117675
URL: http://llvm.org/viewvc/llvm-project?rev=117675&view=rev
Log:
Avoiding overly aggressive latency scheduling. If the two nodes share an
operand and one of them has a single use that is a live out copy, favor the
one that is live out. Otherwise it will be difficult to eliminate the copy
if the instruction is a loop induction variable update. e.g.
BB:
sub r1, r3, #1
str r0, [r2, r3]
mov r3, r1
cmp
bne BB
=>
BB:
str r0, [r2, r3]
sub r3, r3, #1
cmp
bne BB
This fixed the recent 256.bzip2 regression.
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
llvm/trunk/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
llvm/trunk/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll
llvm/trunk/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp?rev=117675&r1=117674&r2=117675&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp Fri Oct 29 13:09:28 2010
@@ -190,7 +190,7 @@
void ScheduleDAGRRList::Schedule() {
DEBUG(dbgs()
<< "********** List Scheduling BB#" << BB->getNumber()
- << " **********\n");
+ << " '" << BB->getName() << "' **********\n");
NumLiveRegs = 0;
LiveRegDefs.resize(TRI->getNumRegs(), NULL);
@@ -1483,6 +1483,46 @@
return Scratches;
}
+/// hasOnlyLiveOutUse - Return true if SU has a single value successor that is a
+/// CopyToReg to a virtual register. This SU def is probably a liveout and
+/// it has no other use. It should be scheduled closer to the terminator.
+static bool hasOnlyLiveOutUses(const SUnit *SU) {
+ bool RetVal = false;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isCtrl()) continue;
+ const SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg) {
+ unsigned Reg =
+ cast<RegisterSDNode>(SuccSU->getNode()->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ RetVal = true;
+ continue;
+ }
+ }
+ return false;
+ }
+ return RetVal;
+}
+
+/// UnitsSharePred - Return true if the two scheduling units share a common
+/// data predecessor.
+static bool UnitsSharePred(const SUnit *left, const SUnit *right) {
+ SmallSet<const SUnit*, 4> Preds;
+ for (SUnit::const_pred_iterator I = left->Preds.begin(),E = left->Preds.end();
+ I != E; ++I) {
+ if (I->isCtrl()) continue; // ignore chain preds
+ Preds.insert(I->getSUnit());
+ }
+ for (SUnit::const_pred_iterator I = right->Preds.begin(),E = right->Preds.end();
+ I != E; ++I) {
+ if (I->isCtrl()) continue; // ignore chain preds
+ if (Preds.count(I->getSUnit()))
+ return true;
+ }
+ return false;
+}
+
template <typename RRSort>
static bool BURRSort(const SUnit *left, const SUnit *right,
const RegReductionPriorityQueue<RRSort> *SPQ) {
@@ -1558,29 +1598,46 @@
else if (!LHigh && RHigh)
return false;
else if (!LHigh && !RHigh) {
+ // If the two nodes share an operand and one of them has a single
+ // use that is a live out copy, favor the one that is live out. Otherwise
+ // it will be difficult to eliminate the copy if the instruction is a
+ // loop induction variable update. e.g.
+ // BB:
+ // sub r1, r3, #1
+ // str r0, [r2, r3]
+ // mov r3, r1
+ // cmp
+ // bne BB
+ bool SharePred = UnitsSharePred(left, right);
+ // FIXME: Only adjust if BB is a loop back edge.
+ // FIXME: What's the cost of a copy?
+ int LBonus = (SharePred && hasOnlyLiveOutUses(left)) ? 1 : 0;
+ int RBonus = (SharePred && hasOnlyLiveOutUses(right)) ? 1 : 0;
+ int LHeight = (int)left->getHeight() - LBonus;
+ int RHeight = (int)right->getHeight() - RBonus;
+
// Low register pressure situation, schedule for latency if possible.
bool LStall = left->SchedulingPref == Sched::Latency &&
- SPQ->getCurCycle() < left->getHeight();
+ (int)SPQ->getCurCycle() < LHeight;
bool RStall = right->SchedulingPref == Sched::Latency &&
- SPQ->getCurCycle() < right->getHeight();
+ (int)SPQ->getCurCycle() < RHeight;
// If scheduling one of the node will cause a pipeline stall, delay it.
// If scheduling either one of the node will cause a pipeline stall, sort
// them according to their height.
- // If neither will cause a pipeline stall, try to reduce register pressure.
if (LStall) {
if (!RStall)
return true;
- if (left->getHeight() != right->getHeight())
- return left->getHeight() > right->getHeight();
+ if (LHeight != RHeight)
+ return LHeight > RHeight;
} else if (RStall)
return false;
- // If either node is scheduling for latency, sort them by height and latency
- // first.
+ // If either node is scheduling for latency, sort them by height
+ // and latency.
if (left->SchedulingPref == Sched::Latency ||
right->SchedulingPref == Sched::Latency) {
- if (left->getHeight() != right->getHeight())
- return left->getHeight() > right->getHeight();
+ if (LHeight != RHeight)
+ return LHeight > RHeight;
if (left->Latency != right->Latency)
return left->Latency > right->Latency;
}
@@ -1631,19 +1688,6 @@
return false;
}
-/// hasCopyToRegUse - Return true if SU has a value successor that is a
-/// CopyToReg node.
-static bool hasCopyToRegUse(const SUnit *SU) {
- for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isCtrl()) continue;
- const SUnit *SuccSU = I->getSUnit();
- if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg)
- return true;
- }
- return false;
-}
-
/// canClobberPhysRegDefs - True if SU would clobber one of SuccSU's
/// physical register defs.
static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU,
@@ -1813,6 +1857,7 @@
if (!Node || !Node->isMachineOpcode() || SU->getNode()->getFlaggedNode())
continue;
+ bool isLiveOut = hasOnlyLiveOutUses(SU);
unsigned Opc = Node->getMachineOpcode();
const TargetInstrDesc &TID = TII->get(Opc);
unsigned NumRes = TID.getNumDefs();
@@ -1862,7 +1907,7 @@
SuccOpc == TargetOpcode::SUBREG_TO_REG)
continue;
if ((!canClobber(SuccSU, DUSU) ||
- (hasCopyToRegUse(SU) && !hasCopyToRegUse(SuccSU)) ||
+ (isLiveOut && !hasOnlyLiveOutUses(SuccSU)) ||
(!SU->isCommutable && SuccSU->isCommutable)) &&
!scheduleDAG->IsReachable(SuccSU, SU)) {
DEBUG(dbgs() << " Adding a pseudo-two-addr edge from SU #"
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp?rev=117675&r1=117674&r2=117675&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp Fri Oct 29 13:09:28 2010
@@ -458,6 +458,15 @@
// Adjust the use operand index by num of defs.
OpIdx += TII->get(Use->getMachineOpcode()).getNumDefs();
int Latency = TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx);
+ if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg &&
+ !BB->succ_empty()) {
+ unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ // This copy is a liveout value. It is likely coalesced, so reduce the
+ // latency so not to penalize the def.
+ // FIXME: need target specific adjustment here?
+ Latency = (Latency > 1) ? Latency - 1 : 1;
+ }
if (Latency >= 0)
dep.setLatency(Latency);
}
Modified: llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp?rev=117675&r1=117674&r2=117675&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp Fri Oct 29 13:09:28 2010
@@ -1967,8 +1967,13 @@
if (!ItinData || ItinData->isEmpty())
return DefTID.mayLoad() ? 3 : 1;
- if (!UseNode->isMachineOpcode())
- return ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx);
+ if (!UseNode->isMachineOpcode()) {
+ int Latency = ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx);
+ if (Subtarget.isCortexA9())
+ return Latency <= 2 ? 1 : Latency - 1;
+ else
+ return Latency <= 3 ? 1 : Latency - 2;
+ }
const TargetInstrDesc &UseTID = get(UseNode->getMachineOpcode());
const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode);
Modified: llvm/trunk/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll?rev=117675&r1=117674&r2=117675&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/2010-06-14-NEONCoalescer.ll Fri Oct 29 13:09:28 2010
@@ -23,7 +23,10 @@
%4 = insertelement <2 x double> %2, double %V.0.ph, i32 1 ; <<2 x double>> [#uses=2]
; Constant pool load followed by add.
; Then clobber the loaded register, not the sum.
+; CHECK: vldr.64
+; CHECK: vadd.f64
; CHECK: vldr.64 [[LDR:d.*]],
+; CHECK: LPC0_0:
; CHECK: vadd.f64 [[ADD:d.*]], [[LDR]], [[LDR]]
; CHECK: vmov.f64 [[LDR]]
%5 = fadd <2 x double> %3, %3 ; <<2 x double>> [#uses=2]
Modified: llvm/trunk/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll?rev=117675&r1=117674&r2=117675&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/cross-rc-coalescing-2.ll Fri Oct 29 13:09:28 2010
@@ -15,9 +15,9 @@
; Loop preheader
; CHECK: vmov.f32
-; CHECK: vmul.f32
; CHECK: vsub.f32
; CHECK: vadd.f32
+; CHECK: vmul.f32
bb7: ; preds = %bb9, %bb.nph
%s1.02 = phi float [ undef, %bb.nph ], [ %35, %bb9 ] ; <float> [#uses=3]
%tmp79 = add i32 undef, undef ; <i32> [#uses=1]
More information about the llvm-commits
mailing list