[llvm] r205481 - [CodeGen] Teach the peephole optimizer to remember (and exploit) all folding

Lang Hames lhames at gmail.com
Wed Apr 2 15:59:58 PDT 2014


Author: lhames
Date: Wed Apr  2 17:59:58 2014
New Revision: 205481

URL: http://llvm.org/viewvc/llvm-project?rev=205481&view=rev
Log:
[CodeGen] Teach the peephole optimizer to remember (and exploit) all folding
opportunities in the current basic block, rather than just the last one seen.

<rdar://problem/16478629>


Added:
    llvm/trunk/test/CodeGen/X86/peephole-multiple-folds.ll
Modified:
    llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp

Modified: llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp?rev=205481&r1=205480&r2=205481&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp (original)
+++ llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp Wed Apr  2 17:59:58 2014
@@ -133,7 +133,8 @@ namespace {
     bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
                        SmallSet<unsigned, 4> &ImmDefRegs,
                        DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
-    bool isLoadFoldable(MachineInstr *MI, unsigned &FoldAsLoadDefReg);
+    bool isLoadFoldable(MachineInstr *MI,
+                        SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);
   };
 }
 
@@ -489,8 +490,9 @@ bool PeepholeOptimizer::optimizeCopyOrBi
 /// isLoadFoldable - Check whether MI is a candidate for folding into a later
 /// instruction. We only fold loads to virtual registers and the virtual
 /// register defined has a single use.
-bool PeepholeOptimizer::isLoadFoldable(MachineInstr *MI,
-                                       unsigned &FoldAsLoadDefReg) {
+bool PeepholeOptimizer::isLoadFoldable(
+                              MachineInstr *MI,
+                              SmallSet<unsigned, 16> &FoldAsLoadDefCandidates) {
   if (!MI->canFoldAsLoad() || !MI->mayLoad())
     return false;
   const MCInstrDesc &MCID = MI->getDesc();
@@ -504,7 +506,7 @@ bool PeepholeOptimizer::isLoadFoldable(M
   if (!MI->getOperand(0).getSubReg() &&
       TargetRegisterInfo::isVirtualRegister(Reg) &&
       MRI->hasOneNonDBGUse(Reg)) {
-    FoldAsLoadDefReg = Reg;
+    FoldAsLoadDefCandidates.insert(Reg);
     return true;
   }
   return false;
@@ -570,18 +572,14 @@ bool PeepholeOptimizer::runOnMachineFunc
 
   bool Changed = false;
 
-  SmallPtrSet<MachineInstr*, 8> LocalMIs;
-  SmallSet<unsigned, 4> ImmDefRegs;
-  DenseMap<unsigned, MachineInstr*> ImmDefMIs;
-  unsigned FoldAsLoadDefReg;
   for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
     MachineBasicBlock *MBB = &*I;
 
     bool SeenMoveImm = false;
-    LocalMIs.clear();
-    ImmDefRegs.clear();
-    ImmDefMIs.clear();
-    FoldAsLoadDefReg = 0;
+    SmallPtrSet<MachineInstr*, 8> LocalMIs;
+    SmallSet<unsigned, 4> ImmDefRegs;
+    DenseMap<unsigned, MachineInstr*> ImmDefMIs;
+    SmallSet<unsigned, 16> FoldAsLoadDefCandidates;
 
     for (MachineBasicBlock::iterator
            MII = I->begin(), MIE = I->end(); MII != MIE; ) {
@@ -595,15 +593,15 @@ bool PeepholeOptimizer::runOnMachineFunc
           continue;
 
       // If there exists an instruction which belongs to the following
-      // categories, we will discard the load candidate.
+      // categories, we will discard the load candidates.
       if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() ||
           MI->isKill() || MI->isInlineAsm() ||
           MI->hasUnmodeledSideEffects()) {
-        FoldAsLoadDefReg = 0;
+        FoldAsLoadDefCandidates.clear();
         continue;
       }
       if (MI->mayStore() || MI->isCall())
-        FoldAsLoadDefReg = 0;
+        FoldAsLoadDefCandidates.clear();
 
       if (((MI->isBitcast() || MI->isCopy()) && optimizeCopyOrBitcast(MI)) ||
           (MI->isCompare() && optimizeCmpInstr(MI, MBB)) ||
@@ -630,30 +628,41 @@ bool PeepholeOptimizer::runOnMachineFunc
       // Check whether MI is a load candidate for folding into a later
       // instruction. If MI is not a candidate, check whether we can fold an
       // earlier load into MI.
-      if (!isLoadFoldable(MI, FoldAsLoadDefReg) && FoldAsLoadDefReg) {
+      if (!isLoadFoldable(MI, FoldAsLoadDefCandidates) &&
+          !FoldAsLoadDefCandidates.empty()) {
         // We need to fold load after optimizeCmpInstr, since optimizeCmpInstr
         // can enable folding by converting SUB to CMP.
        // NOTE(review): this comment is stale — FoldAsLoadDefReg (and the
        // FoldedReg save) no longer exist in this patch. optimizeLoadInstr()
        // resets its register argument on success, so verify that TryFoldReg
        // still holds the folded register when it is later passed to
        // markUsesInDebugValueAsUndef() and erased from the candidate set.
-        unsigned FoldedReg = FoldAsLoadDefReg;
-        MachineInstr *DefMI = 0;
-        MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI,
-                                                      FoldAsLoadDefReg, DefMI);
-        if (FoldMI) {
-          // Update LocalMIs since we replaced MI with FoldMI and deleted DefMI.
-          DEBUG(dbgs() << "Replacing: " << *MI);
-          DEBUG(dbgs() << "     With: " << *FoldMI);
-          LocalMIs.erase(MI);
-          LocalMIs.erase(DefMI);
-          LocalMIs.insert(FoldMI);
-          MI->eraseFromParent();
-          DefMI->eraseFromParent();
-          MRI->markUsesInDebugValueAsUndef(FoldedReg);
-          ++NumLoadFold;
-
-          // MI is replaced with FoldMI.
-          Changed = true;
-          continue;
+        const MCInstrDesc &MIDesc = MI->getDesc();
+        for (unsigned i = MIDesc.getNumDefs(); i != MIDesc.getNumOperands();
+             ++i) {
+          const MachineOperand &MOp = MI->getOperand(i);
+          if (!MOp.isReg())
+            continue;
+          unsigned TryFoldReg = MOp.getReg();
+          if (FoldAsLoadDefCandidates.count(TryFoldReg)) {
+            MachineInstr *DefMI = 0;
+            MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI, TryFoldReg,
+                                                          DefMI);
+            if (FoldMI) {
+              // Update LocalMIs since we replaced MI with FoldMI and deleted
+              // DefMI.
+              DEBUG(dbgs() << "Replacing: " << *MI);
+              DEBUG(dbgs() << "     With: " << *FoldMI);
+              LocalMIs.erase(MI);
+              LocalMIs.erase(DefMI);
+              LocalMIs.insert(FoldMI);
+              MI->eraseFromParent();
+              DefMI->eraseFromParent();
+              MRI->markUsesInDebugValueAsUndef(TryFoldReg);
+              FoldAsLoadDefCandidates.erase(TryFoldReg);
+              ++NumLoadFold;
+              // MI is replaced with FoldMI.
+              Changed = true;
+              break;
+            }
+          }
         }
       }
     }

Added: llvm/trunk/test/CodeGen/X86/peephole-multiple-folds.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/peephole-multiple-folds.ll?rev=205481&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/peephole-multiple-folds.ll (added)
+++ llvm/trunk/test/CodeGen/X86/peephole-multiple-folds.ll Wed Apr  2 17:59:58 2014
@@ -0,0 +1,29 @@
+; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s
+;
+; Test multiple peephole-time folds in a single basic block.
+; <rdar://problem/16478629>
+
+define <8 x float> @test_peephole_multi_fold(<8 x float>* %p1, <8 x float>* %p2) {
+entry:
+  br label %loopbody
+
+loopbody:
+; CHECK: _test_peephole_multi_fold:
+; CHECK: vfmadd231ps (%rdi),
+; CHECK: vfmadd231ps (%rsi),
+  %vsum1 = phi <8 x float> [ %vsum1.next, %loopbody ], [ zeroinitializer, %entry ]
+  %vsum2 = phi <8 x float> [ %vsum2.next, %loopbody ], [ zeroinitializer, %entry ]
+  %m1 = load <8 x float>* %p1, align 1
+  %m2 = load <8 x float>* %p2, align 1
+  %vsum1.next = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %m1, <8 x float> zeroinitializer, <8 x float> %vsum1)
+  %vsum2.next = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %m2, <8 x float> zeroinitializer, <8 x float> %vsum2)
+  %vsum1.next.1 = extractelement <8 x float> %vsum1.next, i32 0
+  %c = fcmp oeq float %vsum1.next.1, 0.0
+  br i1 %c, label %loopbody, label %loopexit
+
+loopexit:
+  %r = fadd <8 x float> %vsum1.next, %vsum2.next
+  ret <8 x float> %r
+}
+
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)





More information about the llvm-commits mailing list