[llvm] r178773 - Avoid high-latency false CPSR dependencies even for tMOVi8.

Jakob Stoklund Olesen stoklund at 2pi.dk
Thu Apr 4 13:25:36 PDT 2013


Author: stoklund
Date: Thu Apr  4 13:25:36 2013
New Revision: 178773

URL: http://llvm.org/viewvc/llvm-project?rev=178773&view=rev
Log:
Avoid high-latency false CPSR dependencies even for tMOVi8.

The Thumb2SizeReduction pass avoids false CPSR dependencies, except it
still aggressively creates tMOVi8 instructions because they are so
common.

Avoid creating false CPSR dependencies even for tMOVi8 instructions when
the CPSR flags are known to have high latency. This allows integer
computation to overlap floating-point computations.

Also process blocks in reverse post-order and propagate high-latency
flags to successors.

<rdar://problem/13468102>
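
For illustration, the false dependency in question looks roughly like
this (a hypothetical Thumb2 sequence; the exact stall depends on the
core, e.g. Cortex-A9):

    vcmpe.f64 d0, d1           @ long-latency VFP compare
    vmrs  APSR_nzcv, fpscr     @ FMSTAT: copies FP status into CPSR
    movs  r0, #7               @ 16-bit tMOVi8 also writes N/Z, so the
                               @ partial CPSR update makes it wait on
                               @ the vmrs result
    mov.w r0, #7               @ 32-bit t2MOVi writes no flags and can
                               @ issue while the FP compare completes

This is why FMSTAT (and tMUL) are treated as high-latency CPSR defs in
the patch below.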

Modified:
    llvm/trunk/lib/Target/ARM/Thumb2SizeReduction.cpp
    llvm/trunk/test/CodeGen/ARM/avoid-cpsr-rmw.ll

Modified: llvm/trunk/lib/Target/ARM/Thumb2SizeReduction.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/Thumb2SizeReduction.cpp?rev=178773&r1=178772&r2=178773&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/Thumb2SizeReduction.cpp (original)
+++ llvm/trunk/lib/Target/ARM/Thumb2SizeReduction.cpp Thu Apr  4 13:25:36 2013
@@ -15,6 +15,7 @@
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "Thumb2InstrInfo.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -79,11 +80,8 @@ namespace {
   { ARM::t2LSLrr, 0,            ARM::tLSLrr,   0,   0,   0,   1,  0,0, 1,0,1 },
   { ARM::t2LSRri, ARM::tLSRri,  0,             5,   0,   1,   0,  0,0, 1,0,1 },
   { ARM::t2LSRrr, 0,            ARM::tLSRrr,   0,   0,   0,   1,  0,0, 1,0,1 },
-  // FIXME: tMOVi8 and tMVN also partially update CPSR but they are less
-  // likely to cause issue in the loop. As a size / performance workaround,
-  // they are not marked as such.
-  { ARM::t2MOVi,  ARM::tMOVi8,  0,             8,   0,   1,   0,  0,0, 0,0,0 },
-  { ARM::t2MOVi16,ARM::tMOVi8,  0,             8,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2MOVi,  ARM::tMOVi8,  0,             8,   0,   1,   0,  0,0, 1,0,0 },
+  { ARM::t2MOVi16,ARM::tMOVi8,  0,             8,   0,   1,   0,  0,0, 1,1,0 },
   // FIXME: Do we need the 16-bit 'S' variant?
   { ARM::t2MOVr,ARM::tMOVr,     0,             0,   0,   0,   0,  1,0, 0,0,0 },
   { ARM::t2MUL,   0,            ARM::tMUL,     0,   0,   0,   1,  0,0, 1,0,0 },
@@ -149,8 +147,7 @@ namespace {
     /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
     DenseMap<unsigned, unsigned> ReduceOpcodeMap;
 
-    bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use,
-                             bool IsSelfLoop);
+    bool canAddPseudoFlagDep(MachineInstr *Use, bool IsSelfLoop);
 
     bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
                          bool is2Addr, ARMCC::CondCodes Pred,
@@ -160,33 +157,46 @@ namespace {
                          const ReduceEntry &Entry);
 
     bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
-                       const ReduceEntry &Entry, bool LiveCPSR,
-                       MachineInstr *CPSRDef, bool IsSelfLoop);
+                       const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop);
 
     /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
     /// instruction.
     bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
-                       const ReduceEntry &Entry,
-                       bool LiveCPSR, MachineInstr *CPSRDef,
+                       const ReduceEntry &Entry, bool LiveCPSR,
                        bool IsSelfLoop);
 
     /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
     /// non-two-address instruction.
     bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
-                        const ReduceEntry &Entry,
-                        bool LiveCPSR, MachineInstr *CPSRDef,
+                        const ReduceEntry &Entry, bool LiveCPSR,
                         bool IsSelfLoop);
 
     /// ReduceMI - Attempt to reduce MI, return true on success.
     bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
-                  bool LiveCPSR, MachineInstr *CPSRDef,
-                  bool IsSelfLoop);
+                  bool LiveCPSR, bool IsSelfLoop);
 
     /// ReduceMBB - Reduce width of instructions in the specified basic block.
     bool ReduceMBB(MachineBasicBlock &MBB);
 
     bool OptimizeSize;
     bool MinimizeSize;
+
+    // Last instruction to define CPSR in the current block.
+    MachineInstr *CPSRDef;
+    // Was CPSR last defined by a high latency instruction?
+    // When CPSRDef is null, this refers to CPSR defs in predecessors.
+    bool HighLatencyCPSR;
+
+    struct MBBInfo {
+      // The flags leaving this block have high latency.
+      bool HighLatencyCPSR;
+      // Has this block been visited yet?
+      bool Visited;
+
+      MBBInfo() : HighLatencyCPSR(false), Visited(false) {}
+    };
+
+    SmallVector<MBBInfo, 8> BlockInfo;
   };
   char Thumb2SizeReduce::ID = 0;
 }
@@ -207,6 +217,16 @@ static bool HasImplicitCPSRDef(const MCI
   return false;
 }
 
+// Check for a likely high-latency flag def.
+static bool isHighLatencyCPSR(MachineInstr *Def) {
+  switch(Def->getOpcode()) {
+  case ARM::FMSTAT:
+  case ARM::tMUL:
+    return true;
+  }
+  return false;
+}
+
 /// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations,
 /// the 's' 16-bit instruction partially update CPSR. Abort the
 /// transformation to avoid adding false dependency on last CPSR setting
@@ -225,20 +245,19 @@ static bool HasImplicitCPSRDef(const MCI
 /// In this case it would have been ok to narrow the mul.w to muls since there
 /// are indirect RAW dependency between the muls and the mul.w
 bool
-Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use,
-                                      bool FirstInSelfLoop) {
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
   // Disable the check for -Oz (aka OptimizeForSizeHarder).
   if (MinimizeSize || !STI->avoidCPSRPartialUpdate())
     return false;
 
-  if (!Def)
+  if (!CPSRDef)
     // If this BB loops back to itself, conservatively avoid narrowing the
     // first instruction that does partial flag update.
-    return FirstInSelfLoop;
+    return HighLatencyCPSR || FirstInSelfLoop;
 
   SmallSet<unsigned, 2> Defs;
-  for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = Def->getOperand(i);
+  for (unsigned i = 0, e = CPSRDef->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = CPSRDef->getOperand(i);
     if (!MO.isReg() || MO.isUndef() || MO.isUse())
       continue;
     unsigned Reg = MO.getReg();
@@ -256,6 +275,16 @@ Thumb2SizeReduce::canAddPseudoFlagDep(Ma
       return false;
   }
 
+  // If the current CPSR has high latency, try to avoid the false dependency.
+  if (HighLatencyCPSR)
+    return true;
+
+  // tMOVi8 usually doesn't start long dependency chains, and there are a lot
+  // of them, so always shrink them when CPSR doesn't have high latency.
+  if (Use->getOpcode() == ARM::t2MOVi ||
+      Use->getOpcode() == ARM::t2MOVi16)
+    return false;
+
   // No read-after-write dependency. The narrowing will add false dependency.
   return true;
 }
@@ -498,16 +527,15 @@ Thumb2SizeReduce::ReduceLoadStore(Machin
 bool
 Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
                                 const ReduceEntry &Entry,
-                                bool LiveCPSR, MachineInstr *CPSRDef,
-                                bool IsSelfLoop) {
+                                bool LiveCPSR, bool IsSelfLoop) {
   unsigned Opc = MI->getOpcode();
   if (Opc == ARM::t2ADDri) {
     // If the source register is SP, try to reduce to tADDrSPi, otherwise
     // it's a normal reduce.
     if (MI->getOperand(1).getReg() != ARM::SP) {
-      if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
+      if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
         return true;
-      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
     }
     // Try to reduce to tADDrSPi.
     unsigned Imm = MI->getOperand(2).getImm();
@@ -557,12 +585,12 @@ Thumb2SizeReduce::ReduceSpecial(MachineB
       switch (Opc) {
       default: break;
       case ARM::t2ADDSri: {
-        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
+        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
           return true;
         // fallthrough
       }
       case ARM::t2ADDSrr:
-        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
       }
     }
     break;
@@ -574,13 +602,13 @@ Thumb2SizeReduce::ReduceSpecial(MachineB
   case ARM::t2UXTB:
   case ARM::t2UXTH:
     if (MI->getOperand(2).getImm() == 0)
-      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
     break;
   case ARM::t2MOVi16:
     // Can convert only 'pure' immediate operands, not immediates obtained as
     // globals' addresses.
     if (MI->getOperand(1).isImm())
-      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
     break;
   case ARM::t2CMPrr: {
     // Try to reduce to the lo-reg only version first. Why there are two
@@ -590,9 +618,9 @@ Thumb2SizeReduce::ReduceSpecial(MachineB
     // source insn opcode. So for now, we hack a local entry record to use.
     static const ReduceEntry NarrowEntry =
       { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1,0 };
-    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef, IsSelfLoop))
+    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, IsSelfLoop))
       return true;
-    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
   }
   }
   return false;
@@ -601,8 +629,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineB
 bool
 Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
                                 const ReduceEntry &Entry,
-                                bool LiveCPSR, MachineInstr *CPSRDef,
-                                bool IsSelfLoop) {
+                                bool LiveCPSR, bool IsSelfLoop) {
 
   if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
     return false;
@@ -683,7 +710,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineB
   // Avoid adding a false dependency on partial flag update by some 16-bit
   // instructions which has the 's' bit set.
   if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
-      canAddPseudoFlagDep(CPSRDef, MI, IsSelfLoop))
+      canAddPseudoFlagDep(MI, IsSelfLoop))
     return false;
 
   // Add the 16-bit instruction.
@@ -720,8 +747,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineB
 bool
 Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
                                  const ReduceEntry &Entry,
-                                 bool LiveCPSR, MachineInstr *CPSRDef,
-                                 bool IsSelfLoop) {
+                                 bool LiveCPSR, bool IsSelfLoop) {
   if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
     return false;
 
@@ -780,7 +806,7 @@ Thumb2SizeReduce::ReduceToNarrow(Machine
   // Avoid adding a false dependency on partial flag update by some 16-bit
   // instructions which has the 's' bit set.
   if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
-      canAddPseudoFlagDep(CPSRDef, MI, IsSelfLoop))
+      canAddPseudoFlagDep(MI, IsSelfLoop))
     return false;
 
   // Add the 16-bit instruction.
@@ -865,8 +891,7 @@ static bool UpdateCPSRUse(MachineInstr &
 }
 
 bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
-                                bool LiveCPSR, MachineInstr *CPSRDef,
-                                bool IsSelfLoop) {
+                                bool LiveCPSR, bool IsSelfLoop) {
   unsigned Opcode = MI->getOpcode();
   DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode);
   if (OPI == ReduceOpcodeMap.end())
@@ -875,16 +900,16 @@ bool Thumb2SizeReduce::ReduceMI(MachineB
 
   // Don't attempt normal reductions on "special" cases for now.
   if (Entry.Special)
-    return ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop);
+    return ReduceSpecial(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
 
   // Try to transform to a 16-bit two-address instruction.
   if (Entry.NarrowOpc2 &&
-      ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
+      ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
     return true;
 
   // Try to transform to a 16-bit non-two-address instruction.
   if (Entry.NarrowOpc1 &&
-      ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop))
+      ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
     return true;
 
   return false;
@@ -895,9 +920,27 @@ bool Thumb2SizeReduce::ReduceMBB(Machine
 
   // Yes, CPSR could be livein.
   bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
-  MachineInstr *CPSRDef = 0;
   MachineInstr *BundleMI = 0;
 
+  CPSRDef = 0;
+  HighLatencyCPSR = false;
+
+  // Check predecessors for the latest CPSRDef.
+  bool HasBackEdges = false;
+  for (MachineBasicBlock::pred_iterator
+       I = MBB.pred_begin(), E = MBB.pred_end(); I != E; ++I) {
+    const MBBInfo &PInfo = BlockInfo[(*I)->getNumber()];
+    if (!PInfo.Visited) {
+      // Since blocks are visited in RPO, this must be a back-edge.
+      HasBackEdges = true;
+      continue;
+    }
+    if (PInfo.HighLatencyCPSR) {
+      HighLatencyCPSR = true;
+      break;
+    }
+  }
+
   // If this BB loops back to itself, conservatively avoid narrowing the
   // first instruction that does partial flag update.
   bool IsSelfLoop = MBB.isSuccessor(&MBB);
@@ -911,13 +954,15 @@ bool Thumb2SizeReduce::ReduceMBB(Machine
       BundleMI = MI;
       continue;
     }
+    if (MI->isDebugValue())
+      continue;
 
     LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
 
     // Does NextMII belong to the same bundle as MI?
     bool NextInSameBundle = NextMII != E && NextMII->isBundledWithPred();
 
-    if (ReduceMI(MBB, MI, LiveCPSR, CPSRDef, IsSelfLoop)) {
+    if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) {
       Modified = true;
       MachineBasicBlock::instr_iterator I = prior(NextMII);
       MI = &*I;
@@ -944,14 +989,19 @@ bool Thumb2SizeReduce::ReduceMBB(Machine
     if (MI->isCall()) {
       // Calls don't really set CPSR.
       CPSRDef = 0;
+      HighLatencyCPSR = false;
       IsSelfLoop = false;
     } else if (DefCPSR) {
       // This is the last CPSR defining instruction.
       CPSRDef = MI;
+      HighLatencyCPSR = isHighLatencyCPSR(CPSRDef);
       IsSelfLoop = false;
     }
   }
 
+  MBBInfo &Info = BlockInfo[MBB.getNumber()];
+  Info.HighLatencyCPSR = HighLatencyCPSR;
+  Info.Visited = true;
   return Modified;
 }
 
@@ -967,9 +1017,16 @@ bool Thumb2SizeReduce::runOnMachineFunct
   MinimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
                                       Attribute::MinSize);
 
+  BlockInfo.clear();
+  BlockInfo.resize(MF.getNumBlockIDs());
+
+  // Visit blocks in reverse post-order so each block's predecessors have
+  // been visited and their outgoing HighLatencyCPSR flag is known.
+  ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
   bool Modified = false;
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
-    Modified |= ReduceMBB(*I);
+  for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator
+       I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
+    Modified |= ReduceMBB(**I);
   return Modified;
 }
 

Modified: llvm/trunk/test/CodeGen/ARM/avoid-cpsr-rmw.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/avoid-cpsr-rmw.ll?rev=178773&r1=178772&r2=178773&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/avoid-cpsr-rmw.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/avoid-cpsr-rmw.ll Thu Apr  4 13:25:36 2013
@@ -83,3 +83,34 @@ while.body:
 while.end:
   ret void
 }
+
+; Avoid producing tMOVi8 after a high-latency flag-setting operation.
+; <rdar://problem/13468102>
+define void @t4(i32* nocapture %p, double* nocapture %q) {
+entry:
+; CHECK: t4
+; CHECK: vmrs APSR_nzcv, fpscr
+; CHECK: if.then
+; CHECK-NOT: movs
+  %0 = load double* %q, align 4
+  %cmp = fcmp olt double %0, 1.000000e+01
+  %incdec.ptr1 = getelementptr inbounds i32* %p, i32 1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  store i32 7, i32* %p, align 4
+  %incdec.ptr2 = getelementptr inbounds i32* %p, i32 2
+  store i32 8, i32* %incdec.ptr1, align 4
+  store i32 9, i32* %incdec.ptr2, align 4
+  br label %if.end
+
+if.else:
+  store i32 3, i32* %p, align 4
+  %incdec.ptr5 = getelementptr inbounds i32* %p, i32 2
+  store i32 5, i32* %incdec.ptr1, align 4
+  store i32 6, i32* %incdec.ptr5, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
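
For reference, the codegen the new CHECK lines are guarding looks
roughly like this (an illustrative sketch, not actual compiler output;
register choices are hypothetical):

    vmrs  APSR_nzcv, fpscr     @ high-latency CPSR def from the fcmp
    ...
    mov.w r1, #7               @ 32-bit moves write no flags; a 16-bit
    mov.w r2, #8               @ "movs" here would add a false
    mov.w r3, #9               @ dependency on the vmrs above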

More information about the llvm-commits mailing list