[llvm] r306758 - Remove redundant copy in recurrences

Taewook Oh via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 29 16:11:24 PDT 2017


Author: twoh
Date: Thu Jun 29 16:11:24 2017
New Revision: 306758

URL: http://llvm.org/viewvc/llvm-project?rev=306758&view=rev
Log:
Remove redundant copy in recurrences

Summary:
If there is a chain of instructions formulating a recurrence, commuting operands can help remove a redundant copy. In the following example code,

```
BB#1: ; Loop Header
  %vreg0<def> = COPY %vreg13<kill>; GR32:%vreg0,%vreg13
  ...

BB#6: ; Loop Latch
  %vreg2<def> = COPY %vreg15<kill>; GR32:%vreg2,%vreg15
  %vreg10<def,tied1> = ADD32rr %vreg1<kill,tied0>, %vreg0<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg10,%vreg1,%vreg0
  %vreg3<def,tied1> = ADD32rr %vreg2<kill,tied0>, %vreg10<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg3,%vreg2,%vreg10
  CMP32ri8 %vreg3, 10, %EFLAGS<imp-def>; GR32:%vreg3
  %vreg13<def> = COPY %vreg3<kill>; GR32:%vreg13,%vreg3
  JL_1 <BB#1>, %EFLAGS<imp-use,kill>
```
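
At the source level, the recurrence corresponds to a loop of roughly the following shape (a C++ sketch reconstructed from the IR of the test case added by this patch; variable names are illustrative):

```
int foo(int a) {
  int x = 0;                  // the loop-carried PHI value (%vreg13 -> %vreg0)
  do {
    int u = (a == 0) ? 1 : 2; // loop-varying, not part of the recurrence
    int v = (u == 0) ? 1 : 2; // loop-varying, not part of the recurrence
    x = v + (u + x);          // %vreg10 = u + x; %vreg3 = v + %vreg10
  } while (x < 10);           // CMP32ri8 %vreg3, 10; JL_1 <BB#1>
  return 0;
}
```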

The existing two-address generation pass generates the following code:

```
BB#1:
  %vreg0<def> = COPY %vreg13<kill>; GR32:%vreg0,%vreg13
  ...

BB#6:
    Predecessors according to CFG: BB#5 BB#4
  %vreg2<def> = COPY %vreg15<kill>; GR32:%vreg2,%vreg15
  %vreg10<def> = COPY %vreg1<kill>; GR32:%vreg10,%vreg1
  %vreg10<def,tied1> = ADD32rr %vreg10<tied0>, %vreg0<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg10,%vreg0
  %vreg3<def> = COPY %vreg10<kill>; GR32:%vreg3,%vreg10
  %vreg3<def,tied1> = ADD32rr %vreg3<tied0>, %vreg2<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg3,%vreg2
  CMP32ri8 %vreg3, 10, %EFLAGS<imp-def>; GR32:%vreg3
  %vreg13<def> = COPY %vreg3<kill>; GR32:%vreg13,%vreg3
  JL_1 <BB#1>, %EFLAGS<imp-use,kill>
  JMP_1 <BB#7>
```
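
The copies into %vreg10 and %vreg3 appear because ADD32rr is a two-address instruction: its destination is tied to its first source operand, so the pass materializes a copy of that source into the destination register before each add.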

This is suboptimal because the generated assembly has a redundant copy at the end of BB#6 to feed %vreg13 back to BB#1:

```
.LBB0_6:
  addl  %esi, %edi
  addl  %ebx, %edi
  cmpl  $10, %edi
  movl  %edi, %esi
  jl  .LBB0_1
```
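
Here the movl %edi, %esi is the redundant copy: the adds accumulate the new value in %edi, while the loop expects the feedback value in %esi at the top of BB#1.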

This redundant copy can be eliminated by making the instructions in the recurrence chain compute the value "into" the register that actually holds the feedback value. In this example, this can be achieved by commuting %vreg0 and %vreg1 in the computation of %vreg10. With that change, the code after two-address generation becomes

```
BB#1:
  %vreg0<def> = COPY %vreg13<kill>; GR32:%vreg0,%vreg13
  ...

BB#6: derived from LLVM BB %bb7
    Predecessors according to CFG: BB#5 BB#4
  %vreg2<def> = COPY %vreg15<kill>; GR32:%vreg2,%vreg15
  %vreg10<def> = COPY %vreg0<kill>; GR32:%vreg10,%vreg0
  %vreg10<def,tied1> = ADD32rr %vreg10<tied0>, %vreg1<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg10,%vreg1
  %vreg3<def> = COPY %vreg10<kill>; GR32:%vreg3,%vreg10
  %vreg3<def,tied1> = ADD32rr %vreg3<tied0>, %vreg2<kill>, %EFLAGS<imp-def,dead>; GR32:%vreg3,%vreg2
  CMP32ri8 %vreg3, 10, %EFLAGS<imp-def>; GR32:%vreg3
  %vreg13<def> = COPY %vreg3<kill>; GR32:%vreg13,%vreg3
  JL_1 <BB#1>, %EFLAGS<imp-use,kill>
  JMP_1 <BB#7>
```

and the final assembly does not have the redundant copy:

```
.LBB0_6:
  addl  %edi, %eax
  addl  %ebx, %eax
  cmpl  $10, %eax
  jl  .LBB0_1
```
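
Conceptually, the peephole walks the single-use def-use chain from the loop-header PHI and commutes any commutable two-address instruction that receives the recurrence value in its non-tied operand. The following self-contained C++ toy models just that commuting decision; it is illustrative only and does not use the LLVM API (see the PeepholeOptimizer.cpp diff below for the real implementation):

```
#include <cstdio>
#include <utility>

// Toy model: a two-address add "dst = ADD src0, src1" ties dst to src0.
// If the recurrence (PHI feedback) value arrives as src1, swapping the
// sources lets the register allocator coalesce dst with the feedback
// value and drop the copy at the bottom of the loop.
struct TwoAddrAdd {
  unsigned Src0, Src1; // virtual register numbers; dst is tied to Src0
};

// Commute so that the recurrence register ends up in the tied slot.
// Returns true if a commute was performed.
bool commuteForRecurrence(TwoAddrAdd &I, unsigned RecurrenceReg) {
  if (I.Src1 != RecurrenceReg)
    return false; // recurrence value already tied, or not an operand here
  std::swap(I.Src0, I.Src1);
  return true;
}

int main() {
  // Models "%vreg10 = ADD32rr %vreg1<tied>, %vreg0" where %vreg0 carries
  // the recurrence; commuting puts %vreg0 in the tied position.
  TwoAddrAdd Add = {/*Src0=*/1, /*Src1=*/0};
  if (commuteForRecurrence(Add, /*RecurrenceReg=*/0))
    std::printf("commuted: dst now tied to %%vreg%u\n", Add.Src0);
  return 0;
}
```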

Reviewers: qcolombet, MatzeB, wmi

Reviewed By: wmi

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D31821

Added:
    llvm/trunk/test/CodeGen/X86/peephole-recurrence.mir
Modified:
    llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
    llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp
    llvm/trunk/test/CodeGen/MIR/Generic/multiRunPass.mir

Modified: llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp?rev=306758&r1=306757&r2=306758&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp (original)
+++ llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp Thu Jun 29 16:11:24 2017
@@ -76,6 +76,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
@@ -119,6 +120,14 @@ static cl::opt<unsigned> RewritePHILimit
     "rewrite-phi-limit", cl::Hidden, cl::init(10),
     cl::desc("Limit the length of PHI chains to lookup"));
 
+// Limit the length of recurrence chain when evaluating the benefit of
+// commuting operands.
+static cl::opt<unsigned> MaxRecurrenceChain(
+    "recurrence-chain-limit", cl::Hidden, cl::init(3),
+    cl::desc("Maximum length of recurrence chain when evaluating the benefit "
+             "of commuting operands"));
+
+
 STATISTIC(NumReuse,      "Number of extension results reused");
 STATISTIC(NumCmps,       "Number of compares eliminated");
 STATISTIC(NumImmFold,    "Number of move immediate folded");
@@ -131,12 +140,14 @@ STATISTIC(NumNAPhysCopies, "Number of no
 namespace {
 
   class ValueTrackerResult;
+  class RecurrenceInstr;
 
   class PeepholeOptimizer : public MachineFunctionPass {
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
     MachineRegisterInfo   *MRI;
     MachineDominatorTree  *DT;  // Machine dominator tree
+    MachineLoopInfo       *MLI;
 
   public:
     static char ID; // Pass identification
@@ -150,6 +161,8 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       MachineFunctionPass::getAnalysisUsage(AU);
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
       if (Aggressive) {
         AU.addRequired<MachineDominatorTree>();
         AU.addPreserved<MachineDominatorTree>();
@@ -160,6 +173,9 @@ namespace {
     typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult>
         RewriteMapTy;
 
+    /// \brief Sequence of instructions that formulate a recurrence cycle.
+    typedef SmallVector<RecurrenceInstr, 4> RecurrenceCycle;
+
   private:
     bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
     bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
@@ -170,6 +186,7 @@ namespace {
     bool optimizeCoalescableCopy(MachineInstr *MI);
     bool optimizeUncoalescableCopy(MachineInstr *MI,
                                    SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+    bool optimizeRecurrence(MachineInstr &PHI);
     bool findNextSource(unsigned Reg, unsigned SubReg,
                         RewriteMapTy &RewriteMap);
     bool isMoveImmediate(MachineInstr *MI,
@@ -178,6 +195,13 @@ namespace {
     bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
                        SmallSet<unsigned, 4> &ImmDefRegs,
                        DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+    /// \brief Finds recurrence cycles, but only ones that are formulated
+    /// around a def operand and a use operand that are tied. If there is a
+    /// use operand commutable with the tied use operand, find the recurrence
+    /// cycle along that operand as well.
+    bool findTargetRecurrence(unsigned Reg,
+                              const SmallSet<unsigned, 2> &TargetReg,
+                              RecurrenceCycle &RC);
 
     /// \brief If copy instruction \p MI is a virtual register copy, track it in
     /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
@@ -222,6 +246,28 @@ namespace {
     }
   };
 
+  /// \brief Helper class to hold instructions that are inside recurrence
+  /// cycles. The recurrence cycle is formulated around 1) a def operand and its
+  /// tied use operand, or 2) a def operand and a use operand that is commutable
+  /// with another use operand which is tied to the def operand. In the latter
+  /// case, the indices of the tied use operand and the commutable use
+  /// operand are maintained in CommutePair.
+  class RecurrenceInstr {
+  public:
+    typedef std::pair<unsigned, unsigned> IndexPair;
+
+    RecurrenceInstr(MachineInstr *MI) : MI(MI) {}
+    RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2)
+      : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {}
+
+    MachineInstr *getMI() const { return MI; }
+    Optional<IndexPair> getCommutePair() const { return CommutePair; }
+
+  private:
+    MachineInstr *MI;
+    Optional<IndexPair> CommutePair;
+  };
+
   /// \brief Helper class to hold a reply for ValueTracker queries. Contains the
   /// returned sources for a given search and the instructions where the sources
   /// were tracked from.
@@ -412,6 +458,7 @@ char &llvm::PeepholeOptimizerID = Peepho
 INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
                 "Peephole Optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
                 "Peephole Optimizations", false, false)
 
@@ -1487,6 +1534,113 @@ bool PeepholeOptimizer::foldRedundantNAP
   return false;
 }
 
+/// \brief Returns true if \p MO is a virtual register operand.
+static bool isVirtualRegisterOperand(MachineOperand &MO) {
+  if (!MO.isReg())
+    return false;
+  return TargetRegisterInfo::isVirtualRegister(MO.getReg());
+}
+
+bool PeepholeOptimizer::findTargetRecurrence(
+    unsigned Reg, const SmallSet<unsigned, 2> &TargetRegs,
+    RecurrenceCycle &RC) {
+  // Recurrence found if Reg is in TargetRegs.
+  if (TargetRegs.count(Reg))
+    return true;
+
+  // TODO: Currently, we only allow the last instruction of the recurrence
+  // cycle (the instruction that feeds the PHI instruction) to have more than
+  // one use, to guarantee that commuting operands does not tie registers
+  // with overlapping live ranges. Once we have actual live range info for
+  // each register, this constraint can be relaxed.
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  // Give up if the recurrence chain length is longer than the limit.
+  if (RC.size() >= MaxRecurrenceChain)
+    return false;
+
+  MachineInstr &MI = *(MRI->use_instr_nodbg_begin(Reg));
+  unsigned Idx = MI.findRegisterUseOperandIdx(Reg);
+
+  // Only interested in recurrences whose instructions have only one def, which
+  // is a virtual register.
+  if (MI.getDesc().getNumDefs() != 1)
+    return false;
+
+  MachineOperand &DefOp = MI.getOperand(0);
+  if (!isVirtualRegisterOperand(DefOp))
+    return false;
+
+  // Check if the def operand of MI is tied to any use operand. We are only
+  // interested in the case where all the instructions in the recurrence chain
+  // have their def operand tied to one of the use operands.
+  unsigned TiedUseIdx;
+  if (!MI.isRegTiedToUseOperand(0, &TiedUseIdx))
+    return false;
+
+  if (Idx == TiedUseIdx) {
+    RC.push_back(RecurrenceInstr(&MI));
+    return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+  } else {
+    // If Idx is not TiedUseIdx, check if Idx is commutable with TiedUseIdx.
+    unsigned CommIdx = TargetInstrInfo::CommuteAnyOperandIndex;
+    if (TII->findCommutedOpIndices(MI, Idx, CommIdx) && CommIdx == TiedUseIdx) {
+      RC.push_back(RecurrenceInstr(&MI, Idx, CommIdx));
+      return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+    }
+  }
+
+  return false;
+}
+
+/// \brief Phi instructions will eventually be lowered to copy instructions.
+/// If the phi is in a loop header, a recurrence may be formulated around the
+/// source and destination of the phi. In such a case, commuting operands of
+/// the instructions in the recurrence may enable coalescing of the copy
+/// instruction generated from the phi. For example, if there is a recurrence of
+///
+/// LoopHeader:
+///   %vreg1 = phi(%vreg0, %vreg100)
+/// LoopLatch:
+///   %vreg0<def, tied1> = ADD %vreg2<def, tied0>, %vreg1
+///
+/// then the fact that vreg0 and vreg2 are in the same tied-operand set makes
+/// coalescing the copy instruction generated from the phi in LoopHeader
+/// (i.e. %vreg1 = COPY %vreg0) impossible, because %vreg1 and %vreg2 have
+/// overlapping live ranges. This introduces an additional move instruction
+/// into the final assembly. However, if we commute %vreg2 and %vreg1 of the
+/// ADD instruction, the redundant move instruction can be avoided.
+bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
+  SmallSet<unsigned, 2> TargetRegs;
+  for (unsigned Idx = 1; Idx < PHI.getNumOperands(); Idx += 2) {
+    MachineOperand &MO = PHI.getOperand(Idx);
+    assert(isVirtualRegisterOperand(MO) && "Invalid PHI instruction");
+    TargetRegs.insert(MO.getReg());
+  }
+
+  bool Changed = false;
+  RecurrenceCycle RC;
+  if (findTargetRecurrence(PHI.getOperand(0).getReg(), TargetRegs, RC)) {
+    // Commute operands of instructions in RC if necessary so that the copy
+    // to be generated from the PHI can be coalesced.
+    DEBUG(dbgs() << "Optimize recurrence chain from " << PHI);
+    for (auto &RI : RC) {
+      DEBUG(dbgs() << "\tInst: " << *(RI.getMI()));
+      auto CP = RI.getCommutePair();
+      if (CP) {
+        Changed = true;
+        TII->commuteInstruction(*(RI.getMI()), false, (*CP).first,
+                                (*CP).second);
+        DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI()));
+      }
+    }
+  }
+
+  return Changed;
+}
+
 bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
@@ -1501,6 +1655,7 @@ bool PeepholeOptimizer::runOnMachineFunc
   TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
   DT  = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
+  MLI = &getAnalysis<MachineLoopInfo>();
 
   bool Changed = false;
 
@@ -1529,6 +1684,8 @@ bool PeepholeOptimizer::runOnMachineFunc
     SmallSet<unsigned, 4> CopySrcRegs;
     DenseMap<unsigned, MachineInstr *> CopySrcMIs;
 
+    bool IsLoopHeader = MLI->isLoopHeader(&MBB);
+
     for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
          MII != MIE; ) {
       MachineInstr *MI = &*MII;
@@ -1540,9 +1697,16 @@ bool PeepholeOptimizer::runOnMachineFunc
       if (MI->isDebugValue())
           continue;
 
-      if (MI->isPosition() || MI->isPHI())
+      if (MI->isPosition())
         continue;
 
+      if (IsLoopHeader && MI->isPHI()) {
+        if (optimizeRecurrence(*MI)) {
+          Changed = true;
+          continue;
+        }
+      }
+
       if (!MI->isCopy()) {
         for (const auto &Op : MI->operands()) {
           // Visit all operands: definitions can be implicit or explicit.
@@ -1667,7 +1831,7 @@ bool PeepholeOptimizer::runOnMachineFunc
               MRI->markUsesInDebugValueAsUndef(FoldedReg);
               FoldAsLoadDefCandidates.erase(FoldedReg);
               ++NumLoadFold;
-              
+
               // MI is replaced with FoldMI so we can continue trying to fold
               Changed = true;
               MI = FoldMI;
@@ -1675,7 +1839,7 @@ bool PeepholeOptimizer::runOnMachineFunc
           }
         }
       }
-      
+
       // If we run into an instruction we can't fold across, discard
       // the load candidates.  Note: We might be able to fold *into* this
       // instruction, so this needs to be after the folding logic.

Modified: llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp?rev=306758&r1=306757&r2=306758&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp (original)
+++ llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp Thu Jun 29 16:11:24 2017
@@ -68,6 +68,13 @@ EnableRescheduling("twoaddr-reschedule",
                    cl::desc("Coalesce copies by rescheduling (default=true)"),
                    cl::init(true), cl::Hidden);
 
+// Limit the number of dataflow edges to traverse when evaluating the benefit
+// of commuting operands.
+static cl::opt<unsigned> MaxDataFlowEdge(
+    "dataflow-edge-limit", cl::Hidden, cl::init(3),
+    cl::desc("Maximum number of dataflow edges to traverse when evaluating "
+             "the benefit of commuting operands"));
+
 namespace {
 class TwoAddressInstructionPass : public MachineFunctionPass {
   MachineFunction *MF;
@@ -637,10 +644,10 @@ isProfitableToCommute(unsigned regA, uns
   // To more generally minimize register copies, ideally the logic of two addr
   // instruction pass should be integrated with register allocation pass where
   // interference graph is available.
-  if (isRevCopyChain(regC, regA, 3))
+  if (isRevCopyChain(regC, regA, MaxDataFlowEdge))
     return true;
 
-  if (isRevCopyChain(regB, regA, 3))
+  if (isRevCopyChain(regB, regA, MaxDataFlowEdge))
     return false;
 
   // Since there are no intervening uses for both registers, then commute
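
Since both new limits are plain cl::opt flags, they can be overridden on the llc command line for experimentation, for example -recurrence-chain-limit=5 or -dataflow-edge-limit=4 (illustrative values; the flags are cl::Hidden, so they are accepted but not listed in -help).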

Modified: llvm/trunk/test/CodeGen/MIR/Generic/multiRunPass.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MIR/Generic/multiRunPass.mir?rev=306758&r1=306757&r2=306758&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/MIR/Generic/multiRunPass.mir (original)
+++ llvm/trunk/test/CodeGen/MIR/Generic/multiRunPass.mir Thu Jun 29 16:11:24 2017
@@ -7,7 +7,8 @@
 # This test ensures that the command line accepts
 # several run passes on the same command line and
 # actually create the proper pipeline for it.
-# PSEUDO_PEEPHOLE: -expand-isel-pseudos {{(-machineverifier )?}}-peephole-opt
+# PSEUDO_PEEPHOLE: -expand-isel-pseudos
+# PSEUDO_PEEPHOLE-SAME: {{(-machineverifier )?}}-peephole-opt
 # PEEPHOLE_PSEUDO: -peephole-opt {{(-machineverifier )?}}-expand-isel-pseudos
 
 # Make sure there are no other passes happening after what we asked.
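
The check is split into a -SAME continuation because peephole-opt now requires MachineLoopInfo, so additional analysis passes may be scheduled between -expand-isel-pseudos and -peephole-opt in the printed pipeline.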

Added: llvm/trunk/test/CodeGen/X86/peephole-recurrence.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/peephole-recurrence.mir?rev=306758&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/peephole-recurrence.mir (added)
+++ llvm/trunk/test/CodeGen/X86/peephole-recurrence.mir Thu Jun 29 16:11:24 2017
@@ -0,0 +1,232 @@
+# RUN: llc -mtriple=x86_64-- -run-pass=peephole-opt -o - %s | FileCheck %s
+
+--- |
+  define i32 @foo(i32 %a) {
+  bb0:
+    br label %bb1
+  
+  bb1:                                              ; preds = %bb7, %bb0
+    %vreg0 = phi i32 [ 0, %bb0 ], [ %vreg3, %bb7 ]
+    %cond0 = icmp eq i32 %a, 0
+    br i1 %cond0, label %bb4, label %bb3
+  
+  bb3:                                              ; preds = %bb1
+    br label %bb4
+  
+  bb4:                                              ; preds = %bb1, %bb3
+    %vreg5 = phi i32 [ 2, %bb3 ], [ 1, %bb1 ]
+    %cond1 = icmp eq i32 %vreg5, 0
+    br i1 %cond1, label %bb7, label %bb6
+  
+  bb6:                                              ; preds = %bb4
+    br label %bb7
+  
+  bb7:                                              ; preds = %bb4, %bb6
+    %vreg1 = phi i32 [ 2, %bb6 ], [ 1, %bb4 ]
+    %vreg2 = add i32 %vreg5, %vreg0
+    %vreg3 = add i32 %vreg1, %vreg2
+    %cond2 = icmp slt i32 %vreg3, 10
+    br i1 %cond2, label %bb1, label %bb8
+  
+  bb8:                                              ; preds = %bb7
+    ret i32 0
+  }
+  
+  define i32 @bar(i32 %a, i32* %p) {
+  bb0:
+    br label %bb1
+  
+  bb1:                                              ; preds = %bb7, %bb0
+    %vreg0 = phi i32 [ 0, %bb0 ], [ %vreg3, %bb7 ]
+    %cond0 = icmp eq i32 %a, 0
+    br i1 %cond0, label %bb4, label %bb3
+  
+  bb3:                                              ; preds = %bb1
+    br label %bb4
+  
+  bb4:                                              ; preds = %bb1, %bb3
+    %vreg5 = phi i32 [ 2, %bb3 ], [ 1, %bb1 ]
+    %cond1 = icmp eq i32 %vreg5, 0
+    br i1 %cond1, label %bb7, label %bb6
+  
+  bb6:                                              ; preds = %bb4
+    br label %bb7
+  
+  bb7:                                              ; preds = %bb4, %bb6
+    %vreg1 = phi i32 [ 2, %bb6 ], [ 1, %bb4 ]
+    %vreg2 = add i32 %vreg5, %vreg0
+    store i32 %vreg0, i32* %p
+    %vreg3 = add i32 %vreg1, %vreg2
+    %cond2 = icmp slt i32 %vreg3, 10
+    br i1 %cond2, label %bb1, label %bb8
+  
+  bb8:                                              ; preds = %bb7
+    ret i32 0
+  }
+
+...
+---
+# There is a recurrence formulated around %0, %10, and %3. Check that operands
+# are commuted for ADD instructions in bb.5.bb7 so that the values involved in
+# the recurrence are tied. This will remove a redundant copy instruction.
+name:            foo
+tracksRegLiveness: true
+registers:       
+  - { id: 0, class: gr32, preferred-register: '' }
+  - { id: 1, class: gr32, preferred-register: '' }
+  - { id: 2, class: gr32, preferred-register: '' }
+  - { id: 3, class: gr32, preferred-register: '' }
+  - { id: 4, class: gr32, preferred-register: '' }
+  - { id: 5, class: gr32, preferred-register: '' }
+  - { id: 6, class: gr32, preferred-register: '' }
+  - { id: 7, class: gr32, preferred-register: '' }
+  - { id: 8, class: gr32, preferred-register: '' }
+  - { id: 9, class: gr32, preferred-register: '' }
+  - { id: 10, class: gr32, preferred-register: '' }
+  - { id: 11, class: gr32, preferred-register: '' }
+  - { id: 12, class: gr32, preferred-register: '' }
+liveins:         
+  - { reg: '%edi', virtual-reg: '%4' }
+body:             |
+  bb.0.bb0:
+    successors: %bb.1.bb1(0x80000000)
+    liveins: %edi
+  
+    %4 = COPY %edi
+    %5 = MOV32r0 implicit-def dead %eflags
+  
+  bb.1.bb1:
+    successors: %bb.3.bb4(0x30000000), %bb.2.bb3(0x50000000)
+  
+    ; CHECK: %0 = PHI %5, %bb.0.bb0, %3, %bb.5.bb7
+    %0 = PHI %5, %bb.0.bb0, %3, %bb.5.bb7
+    %6 = MOV32ri 1
+    TEST32rr %4, %4, implicit-def %eflags
+    JE_1 %bb.3.bb4, implicit %eflags
+    JMP_1 %bb.2.bb3
+  
+  bb.2.bb3:
+    successors: %bb.3.bb4(0x80000000)
+  
+    %7 = MOV32ri 2
+  
+  bb.3.bb4:
+    successors: %bb.5.bb7(0x30000000), %bb.4.bb6(0x50000000)
+  
+    %1 = PHI %6, %bb.1.bb1, %7, %bb.2.bb3
+    TEST32rr %1, %1, implicit-def %eflags
+    JE_1 %bb.5.bb7, implicit %eflags
+    JMP_1 %bb.4.bb6
+  
+  bb.4.bb6:
+    successors: %bb.5.bb7(0x80000000)
+  
+    %9 = MOV32ri 2
+  
+  bb.5.bb7:
+    successors: %bb.1.bb1(0x7c000000), %bb.6.bb8(0x04000000)
+  
+    %2 = PHI %6, %bb.3.bb4, %9, %bb.4.bb6
+    %10 = ADD32rr %1, %0, implicit-def dead %eflags
+    ; CHECK: %10 = ADD32rr
+    ; CHECK-SAME: %0,
+    ; CHECK-SAME: %1,
+    %3 = ADD32rr %2, killed %10, implicit-def dead %eflags
+    ; CHECK: %3 = ADD32rr
+    ; CHECK-SAME: %10,
+    ; CHECK-SAME: %2,
+    %11 = SUB32ri8 %3, 10, implicit-def %eflags
+    JL_1 %bb.1.bb1, implicit %eflags
+    JMP_1 %bb.6.bb8
+  
+  bb.6.bb8:
+    %12 = MOV32r0 implicit-def dead %eflags
+    %eax = COPY %12
+    RET 0, %eax
+
+...
+---
+# Here a recurrence is formulated around %0, %11, and %3, but the operands
+# should not be commuted because %0 has a use outside of the recurrence. This
+# prevents commuting the operands from tying values with overlapping live
+# ranges.
+name:            bar
+tracksRegLiveness: true
+registers:       
+  - { id: 0, class: gr32, preferred-register: '' }
+  - { id: 1, class: gr32, preferred-register: '' }
+  - { id: 2, class: gr32, preferred-register: '' }
+  - { id: 3, class: gr32, preferred-register: '' }
+  - { id: 4, class: gr32, preferred-register: '' }
+  - { id: 5, class: gr64, preferred-register: '' }
+  - { id: 6, class: gr32, preferred-register: '' }
+  - { id: 7, class: gr32, preferred-register: '' }
+  - { id: 8, class: gr32, preferred-register: '' }
+  - { id: 9, class: gr32, preferred-register: '' }
+  - { id: 10, class: gr32, preferred-register: '' }
+  - { id: 11, class: gr32, preferred-register: '' }
+  - { id: 12, class: gr32, preferred-register: '' }
+  - { id: 13, class: gr32, preferred-register: '' }
+liveins:         
+  - { reg: '%edi', virtual-reg: '%4' }
+  - { reg: '%rsi', virtual-reg: '%5' }
+body:             |
+  bb.0.bb0:
+    successors: %bb.1.bb1(0x80000000)
+    liveins: %edi, %rsi
+  
+    %5 = COPY %rsi
+    %4 = COPY %edi
+    %6 = MOV32r0 implicit-def dead %eflags
+  
+  bb.1.bb1:
+    successors: %bb.3.bb4(0x30000000), %bb.2.bb3(0x50000000)
+  
+    %0 = PHI %6, %bb.0.bb0, %3, %bb.5.bb7
+    ; CHECK: %0 = PHI %6, %bb.0.bb0, %3, %bb.5.bb7
+    %7 = MOV32ri 1
+    TEST32rr %4, %4, implicit-def %eflags
+    JE_1 %bb.3.bb4, implicit %eflags
+    JMP_1 %bb.2.bb3
+  
+  bb.2.bb3:
+    successors: %bb.3.bb4(0x80000000)
+  
+    %8 = MOV32ri 2
+  
+  bb.3.bb4:
+    successors: %bb.5.bb7(0x30000000), %bb.4.bb6(0x50000000)
+  
+    %1 = PHI %7, %bb.1.bb1, %8, %bb.2.bb3
+    TEST32rr %1, %1, implicit-def %eflags
+    JE_1 %bb.5.bb7, implicit %eflags
+    JMP_1 %bb.4.bb6
+  
+  bb.4.bb6:
+    successors: %bb.5.bb7(0x80000000)
+  
+    %10 = MOV32ri 2
+  
+  bb.5.bb7:
+    successors: %bb.1.bb1(0x7c000000), %bb.6.bb8(0x04000000)
+  
+    %2 = PHI %7, %bb.3.bb4, %10, %bb.4.bb6
+    %11 = ADD32rr %1, %0, implicit-def dead %eflags
+    ; CHECK: %11 = ADD32rr
+    ; CHECK-SAME: %1,
+    ; CHECK-SAME: %0,
+    MOV32mr %5, 1, _, 0, _, %0 :: (store 4 into %ir.p)
+    %3 = ADD32rr %2, killed %11, implicit-def dead %eflags
+    ; CHECK: %3 = ADD32rr
+    ; CHECK-SAME: %2,
+    ; CHECK-SAME: %11,
+    %12 = SUB32ri8 %3, 10, implicit-def %eflags
+    JL_1 %bb.1.bb1, implicit %eflags
+    JMP_1 %bb.6.bb8
+  
+  bb.6.bb8:
+    %13 = MOV32r0 implicit-def dead %eflags
+    %eax = COPY %13
+    RET 0, %eax
+
+...
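
As the RUN line at the top of the test shows, it exercises just the peephole-opt pass on MIR input via llc's -run-pass facility, so the commuting decision is checked directly, before two-address conversion and register allocation.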



