[llvm] [AArch64][GlobalISel] Split offsets of consecutive stores to aid STP … (PR #66980)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 21 01:05:37 PDT 2023
================
@@ -492,7 +507,192 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
F.hasMinSize());
AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo,
RuleConfig, ST, MDT, LI);
- return Impl.combineMachineInstrs();
+ bool Changed = Impl.combineMachineInstrs();
+
+ auto MIB = CSEMIRBuilder(MF);
+ MIB.setCSEInfo(CSEInfo);
+ Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
+ return Changed;
+}
+
+bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
+ SmallVectorImpl<StoreInfo> &Stores, CSEMIRBuilder &MIB) {
+ if (Stores.size() <= 2)
+ return false;
+
+ // Profitability checks:
+ int64_t BaseOffset = Stores[0].Offset;
+ unsigned NumPairsExpected = Stores.size() / 2;
+ unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2);
+ // Size savings will depend on whether we can fold the offset as an
+ // immediate into an ADD.
+ auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
+ if (!TLI.isLegalAddImmediate(BaseOffset))
+ TotalInstsExpected++;
+ int SavingsExpected = Stores.size() - TotalInstsExpected;
+ if (SavingsExpected <= 0)
+ return false;
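+ // For example (illustrative numbers only): with 6 consecutive stores and a
+ // base offset that fits an ADD immediate, NumPairsExpected = 3 and
+ // TotalInstsExpected = 3, so we expect to save 3 instructions; if the
+ // offset cannot be folded, we still expect to save 2.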
+
+ auto &MRI = MIB.getMF().getRegInfo();
+
+ // We have a series of consecutive stores. Factor out the common base
+ // pointer and rewrite the offsets.
+ Register NewBase = Stores[0].Ptr->getReg(0);
+ for (auto &SInfo : Stores) {
+ // Compute a new pointer with the new base ptr and adjusted offset.
+ MIB.setInstrAndDebugLoc(*SInfo.St);
+ auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
+ auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
+ NewBase, NewOff);
+ if (MIB.getObserver())
+ MIB.getObserver()->changingInstr(*SInfo.St);
+ SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
+ if (MIB.getObserver())
+ MIB.getObserver()->changedInstr(*SInfo.St);
+ }
+ LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
+ << " stores into a base pointer and offsets.\n");
+ return true;
+}
+
+static cl::opt<bool>
+ EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable consecutive memop optimization "
+ "in AArch64PostLegalizerCombiner"));
+
+bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
+ MachineFunction &MF, CSEMIRBuilder &MIB) {
+ // This combine needs to run after all reassociations/folds on pointer
+ // addressing have been done, specifically those that combine two G_PTR_ADDs
+ // with constant offsets into a single G_PTR_ADD with a combined offset.
+ // The goal of this optimization is to undo that combine in the case where
+ // doing so has prevented the formation of pair stores due to illegal
+ // addressing modes of STP. The reason that we do it here is because
+ // it's much easier to undo the transformation on a series of consecutive
+ // mem ops than it is to detect when doing it would be a bad idea by looking
+ // at a single G_PTR_ADD in the reassociation/ptradd_immed_chain combine.
+ //
+ // An example:
+ // G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
+ // %off1:_(s64) = G_CONSTANT i64 4128
+ // %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
+ // G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
+ // %off2:_(s64) = G_CONSTANT i64 4144
+ // %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
+ // G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
+ // %off3:_(s64) = G_CONSTANT i64 4160
+ // %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
+ // G_STORE %11:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)
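+ //
+ // After this combine, the later stores are rewritten to address relative
+ // to the first store's pointer, roughly (value names here are invented
+ // for illustration):
+ // %c16:_(s64) = G_CONSTANT i64 16
+ // %new2:_(p0) = G_PTR_ADD %p1:_, %c16:_(s64)
+ // G_STORE %11:_(<2 x s64>), %new2:_(p0) :: (store (<2 x s64>), align 1)
+ // which leaves residual offsets small enough for STP formation to pair
+ // the stores.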
+ bool Changed = false;
+ auto &MRI = MF.getRegInfo();
+
+ if (!EnableConsecutiveMemOpOpt)
+ return Changed;
+
+ SmallVector<StoreInfo, 8> Stores;
+ // If we see a load, then we keep track of any values defined by it.
+ // In the following example, STP formation will fail anyway because
+ // the latter store is using a load result that appears after the
+ // prior store. In this situation, if we factor out the offset, then
+ // we increase code size for no benefit.
+ SmallVector<Register> LoadValsSinceLastStore;
+
+ auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
+ // Check if this store is consecutive to the last one.
+ if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
+ (Last.Offset + static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
+ New.Offset) ||
+ Last.StoredType != New.StoredType)
+ return false;
+
+ // Check if this store is using a load result that appears after the
+ // last store. If so, bail out.
+ if (llvm::any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
+ return New.St->getValueReg() == LoadVal;
+ }))
+ return false;
+
+ // Check if the current offset would be too large for STP.
+ // If not, then STP formation should be able to handle it, so we don't
+ // need to do anything.
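+ // These limits correspond to STP's signed, scaled 7-bit immediate: a
+ // maximum positive offset of 63 * the access size in bytes.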
+ int64_t MaxLegalOffset;
+ switch (New.StoredType.getSizeInBits()) {
+ case 32:
+ MaxLegalOffset = 252;
+ break;
+ case 64:
+ MaxLegalOffset = 504;
+ break;
+ case 128:
+ MaxLegalOffset = 1008;
+ break;
+ default:
+ llvm_unreachable("Unexpected stored type size");
+ }
+ if (New.Offset < MaxLegalOffset)
+ return false;
+
+ // If factoring it out still wouldn't help then don't bother.
+ return New.Offset - Stores[0].Offset <= MaxLegalOffset;
+ };
+
+ for (auto &MBB : MF) {
+ // We're looking inside a single BB at a time since the memset pattern
+ // should only be in a single block.
+
+ Stores.clear();
+ LoadValsSinceLastStore.clear();
+
+ auto resetState = [&]() {
+ Stores.clear();
+ LoadValsSinceLastStore.clear();
+ };
+
+ for (auto &MI : MBB) {
+ if (auto *St = dyn_cast<GStore>(&MI)) {
+ Register PtrBaseReg;
+ APInt Offset;
+ LLT StoredValTy = MRI.getType(St->getValueReg());
+ unsigned ValSize = StoredValTy.getSizeInBits();
+ if (ValSize != St->getMMO().getSizeInBits())
+ continue; // Don't handle truncating stores.
+ if (ValSize < 32)
+ continue; // Can only STP 32b or larger.
----------------
Pierre-vh wrote:
Nit: I'd merge those together, and do the ValSize < 32 check first because it's the cheaper of the two.
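For example (untested sketch, reusing the locals from the patch):

  // Can only STP 32b or larger; don't handle truncating stores.
  if (ValSize < 32 || ValSize != St->getMMO().getSizeInBits())
    continue;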
https://github.com/llvm/llvm-project/pull/66980