[llvm] r373630 - AMDGPU/SILoadStoreOptimizer: Optimize scanning for mergeable instructions

Tom Stellard via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 3 10:11:48 PDT 2019


Author: tstellar
Date: Thu Oct  3 10:11:47 2019
New Revision: 373630

URL: http://llvm.org/viewvc/llvm-project?rev=373630&view=rev
Log:
AMDGPU/SILoadStoreOptimizer: Optimize scanning for mergeable instructions

Summary:
This adds a pre-pass to this optimization that scans through the basic
block and generates lists of mergeable instructions with one list per unique
address.

In the optimization phase instead of scanning through the basic block for mergeable
instructions, we now iterate over the lists generated by the pre-pass.

The decision to re-optimize a block is now made per list, so if we fail to merge any
instructions with the same address, then we do not attempt to optimize them in
future passes over the block.  This will help to reduce the time this pass
spends re-optimizing instructions.

In one pathological test case, this change reduces the time spent in the
SILoadStoreOptimizer from 0.2s to 0.03s.

This restructuring will also make it possible to implement further solutions in
this pass, because we can now add less expensive checks to the pre-pass and
filter instructions out early which will avoid the need to do the expensive
scanning during the optimization pass. For example, checking for adjacent
offsets is an inexpensive test we can move to the pre-pass.

Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65961

Modified:
    llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp?rev=373630&r1=373629&r2=373630&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp Thu Oct  3 10:11:47 2019
@@ -161,6 +161,31 @@ class SILoadStoreOptimizer : public Mach
       return true;
     }
 
+    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
+      for (unsigned i = 0; i < NumAddresses; ++i) {
+        const MachineOperand *AddrOp = AddrReg[i];
+        // Immediates are always OK.
+        if (AddrOp->isImm())
+          continue;
+
+        // Don't try to merge addresses that aren't either immediates or registers.
+        // TODO: Should be possible to merge FrameIndexes and maybe some other
+        // non-register operands.
+        if (!AddrOp->isReg())
+          return false;
+
+        // TODO: We should be able to merge physical reg addresses.
+        if (Register::isPhysicalRegister(AddrOp->getReg()))
+          return false;
+
+        // If an address has only one use then there will be no other
+        // instructions with the same address, so we can't merge this one.
+        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
+          return false;
+      }
+      return true;
+    }
+
     void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
                const GCNSubtarget &STM);
     void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
@@ -220,6 +245,10 @@ private:
   bool promoteConstantOffsetToImm(MachineInstr &CI,
                                   MemInfoMap &Visited,
                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
+  void addInstToMergeableList(const CombineInfo &CI,
+                  std::list<std::list<CombineInfo> > &MergeableInsts) const;
+  bool collectMergeableInsts(MachineBasicBlock &MBB,
+                  std::list<std::list<CombineInfo> > &MergeableInsts) const;
 
 public:
   static char ID;
@@ -228,7 +257,11 @@ public:
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
   }
 
-  bool optimizeBlock(MachineBasicBlock &MBB);
+  void removeCombinedInst(std::list<CombineInfo> &MergeList,
+                                         const MachineInstr &MI);
+  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
+                                     bool &OptimizeListAgain);
+  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -424,6 +457,8 @@ void SILoadStoreOptimizer::CombineInfo::
     AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
     AddrReg[i] = &I->getOperand(AddrIdx[i]);
   }
+
+  InstsToMove.clear();
 }
 
 void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
@@ -646,15 +681,6 @@ bool SILoadStoreOptimizer::findMatchingI
   if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
     return false;
 
-  for (unsigned i = 0; i < CI.NumAddresses; i++) {
-    // We only ever merge operations with the same base address register, so
-    // don't bother scanning forward if there are no other uses.
-    if (CI.AddrReg[i]->isReg() &&
-        (Register::isPhysicalRegister(CI.AddrReg[i]->getReg()) ||
-         MRI->hasOneNonDBGUse(CI.AddrReg[i]->getReg())))
-      return false;
-  }
-
   ++MBBI;
 
   DenseSet<unsigned> RegDefsToMove;
@@ -827,12 +853,11 @@ SILoadStoreOptimizer::mergeRead2Pair(Com
 
   moveInstsAfter(Copy1, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
 
   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
-  return Next;
+  return Read2;
 }
 
 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
@@ -911,12 +936,11 @@ SILoadStoreOptimizer::mergeWrite2Pair(Co
 
   moveInstsAfter(Write2, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
 
   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
-  return Next;
+  return Write2;
 }
 
 MachineBasicBlock::iterator
@@ -938,12 +962,13 @@ SILoadStoreOptimizer::mergeSBufferLoadIm
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
 
-  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
-      .addImm(MergedOffset) // offset
-      .addImm(CI.GLC0)      // glc
-      .addImm(CI.DLC0)      // dlc
-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+  MachineInstr *New =
+    BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+        .addImm(MergedOffset) // offset
+        .addImm(CI.GLC0)      // glc
+        .addImm(CI.DLC0)      // dlc
+        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -963,10 +988,9 @@ SILoadStoreOptimizer::mergeSBufferLoadIm
 
   moveInstsAfter(Copy1, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
-  return Next;
+  return New;
 }
 
 MachineBasicBlock::iterator
@@ -997,15 +1021,16 @@ SILoadStoreOptimizer::mergeBufferLoadPai
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
 
-  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
-      .addImm(MergedOffset) // offset
-      .addImm(CI.GLC0)      // glc
-      .addImm(CI.SLC0)      // slc
-      .addImm(0)            // tfe
-      .addImm(CI.DLC0)      // dlc
-      .addImm(0)            // swz
-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+  MachineInstr *New =
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+        .addImm(MergedOffset) // offset
+        .addImm(CI.GLC0)      // glc
+        .addImm(CI.SLC0)      // slc
+        .addImm(0)            // tfe
+        .addImm(CI.DLC0)      // dlc
+        .addImm(0)            // swz
+        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -1025,10 +1050,9 @@ SILoadStoreOptimizer::mergeBufferLoadPai
 
   moveInstsAfter(Copy1, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
-  return Next;
+  return New;
 }
 
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
@@ -1191,22 +1215,22 @@ SILoadStoreOptimizer::mergeBufferStorePa
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
 
-  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
-      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
-      .addImm(CI.GLC0)      // glc
-      .addImm(CI.SLC0)      // slc
-      .addImm(0)            // tfe
-      .addImm(CI.DLC0)      // dlc
-      .addImm(0)            // swz
-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+  MachineInstr *New =
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+        .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+        .addImm(CI.GLC0)      // glc
+        .addImm(CI.SLC0)      // slc
+        .addImm(0)            // tfe
+        .addImm(CI.DLC0)      // dlc
+        .addImm(0)            // swz
+        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
   moveInstsAfter(MIB, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
-  return Next;
+  return New;
 }
 
 MachineOperand
@@ -1519,32 +1543,105 @@ bool SILoadStoreOptimizer::promoteConsta
   return false;
 }
 
-// Scan through looking for adjacent LDS operations with constant offsets from
-// the same base register. We rely on the scheduler to do the hard work of
-// clustering nearby loads, and assume these are all adjacent.
-bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
-  bool Modified = false;
+void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
+                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
+  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
+    if (AddrList.front().hasSameBaseAddress(*CI.I) &&
+        AddrList.front().InstClass == CI.InstClass) {
+      AddrList.emplace_back(CI);
+      return;
+    }
+  }
 
+  // Base address not found, so add a new list.
+  MergeableInsts.emplace_back(1, CI);
+}
+
+bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
+                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
+  bool Modified = false;
   // Contain the list
   MemInfoMap Visited;
   // Contains the list of instructions for which constant offsets are being
   // promoted to the IMM.
   SmallPtrSet<MachineInstr *, 4> AnchorList;
 
-  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
-    MachineInstr &MI = *I;
-
+  // Sort potential mergeable instructions into lists.  One list per base address.
+  for (MachineInstr &MI : MBB.instrs()) {
+    // We run this before checking if an address is mergeable, because it can produce
+    // better code even if the instructions aren't mergeable.
     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
       Modified = true;
 
+    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
+    if (InstClass == UNKNOWN)
+      continue;
+
     // Don't combine if volatile.
-    if (MI.hasOrderedMemoryRef()) {
-      ++I;
+    if (MI.hasOrderedMemoryRef())
       continue;
-    }
 
     CombineInfo CI;
-    CI.setMI(I, *TII, *STM);
+    CI.setMI(MI, *TII, *STM);
+
+    if (!CI.hasMergeableAddress(*MRI))
+      continue;
+
+    addInstToMergeableList(CI, MergeableInsts);
+  }
+  return Modified;
+}
+
+// Scan through looking for adjacent LDS operations with constant offsets from
+// the same base register. We rely on the scheduler to do the hard work of
+// clustering nearby loads, and assume these are all adjacent.
+bool SILoadStoreOptimizer::optimizeBlock(
+                       std::list<std::list<CombineInfo> > &MergeableInsts) {
+  bool Modified = false;
+
+  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
+    if (MergeList.size() < 2)
+      continue;
+
+    bool OptimizeListAgain = false;
+    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
+      // We weren't able to make any changes, so clear the list so we don't
+      // process the same instructions the next time we try to optimize this
+      // block.
+      MergeList.clear();
+      continue;
+    }
+
+    // We made changes, but also determined that there were no more optimization
+    // opportunities, so we don't need to reprocess the list
+    if (!OptimizeListAgain)
+      MergeList.clear();
+
+    OptimizeAgain |= OptimizeListAgain;
+    Modified = true;
+  }
+  return Modified;
+}
+
+void
+SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
+                                         const MachineInstr &MI) {
+
+  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
+    if (&*CI->I == &MI) {
+      MergeList.erase(CI);
+      return;
+    }
+  }
+}
+
+bool
+SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
+                                          std::list<CombineInfo> &MergeList,
+                                          bool &OptimizeListAgain) {
+  bool Modified = false;
+  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
+    CombineInfo &CI = *I;
 
     switch (CI.InstClass) {
     default:
@@ -1552,55 +1649,57 @@ bool SILoadStoreOptimizer::optimizeBlock
     case DS_READ:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeRead2Pair(CI);
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
+        CI.setMI(NewMI, *TII, *STM);
       }
-      continue;
+      break;
     case DS_WRITE:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeWrite2Pair(CI);
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
+        CI.setMI(NewMI, *TII, *STM);
       }
-      continue;
+      break;
     case S_BUFFER_LOAD_IMM:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeSBufferLoadImmPair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
       }
-      continue;
+      break;
     case BUFFER_LOAD_OFFEN:
     case BUFFER_LOAD_OFFSET:
     case BUFFER_LOAD_OFFEN_exact:
     case BUFFER_LOAD_OFFSET_exact:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeBufferLoadPair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
       }
-      continue;
+      break;
     case BUFFER_STORE_OFFEN:
     case BUFFER_STORE_OFFSET:
     case BUFFER_STORE_OFFEN_exact:
     case BUFFER_STORE_OFFSET_exact:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeBufferStorePair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
       }
-      continue;
+      break;
     }
-
-    ++I;
+    // Clear the InstsToMove after we have finished searching so we don't have
+    // stale values left over if we search for this CI again in another pass
+    // over the block.
+    CI.InstsToMove.clear();
   }
 
   return Modified;
@@ -1626,10 +1725,14 @@ bool SILoadStoreOptimizer::runOnMachineF
 
   bool Modified = false;
 
+
   for (MachineBasicBlock &MBB : MF) {
+    std::list<std::list<CombineInfo> > MergeableInsts;
+    // First pass: Collect list of all instructions we know how to merge.
+    Modified |= collectMergeableInsts(MBB, MergeableInsts);
     do {
       OptimizeAgain = false;
-      Modified |= optimizeBlock(MBB);
+      Modified |= optimizeBlock(MergeableInsts);
     } while (OptimizeAgain);
   }
 




More information about the llvm-commits mailing list