[llvm] [AArch64][GlobalISel] Split offsets of consecutive stores to aid STP … (PR #66980)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 21 01:05:37 PDT 2023
================
@@ -492,7 +507,192 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
F.hasMinSize());
AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo,
RuleConfig, ST, MDT, LI);
- return Impl.combineMachineInstrs();
+ bool Changed = Impl.combineMachineInstrs();
+
+ auto MIB = CSEMIRBuilder(MF);
+ MIB.setCSEInfo(CSEInfo);
+ Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
+ return Changed;
+}
+
+bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
+ SmallVectorImpl<StoreInfo> &Stores, CSEMIRBuilder &MIB) {
+ if (Stores.size() <= 2)
+ return false;
+
+ // Profitability checks:
+ int64_t BaseOffset = Stores[0].Offset;
+ unsigned NumPairsExpected = Stores.size() / 2;
+ unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2);
+ // Size savings will depend on whether we can fold the offset as an
+ // immediate into an ADD.
+ auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
+ if (!TLI.isLegalAddImmediate(BaseOffset))
+ TotalInstsExpected++;
+ int SavingsExpected = Stores.size() - TotalInstsExpected;
+ if (SavingsExpected <= 0)
+ return false;
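+ // For example (illustrative numbers only): with 6 consecutive stores and a
+ // base offset that fits an ADD immediate, NumPairsExpected = 3 and
+ // TotalInstsExpected = 3, so we expect to save 3 instructions; if the
+ // offset cannot be folded, we still expect to save 2.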
+
+ auto &MRI = MIB.getMF().getRegInfo();
+
+ // We have a series of consecutive stores. Factor out the common base
+ // pointer and rewrite the offsets.
+ Register NewBase = Stores[0].Ptr->getReg(0);
+ for (auto &SInfo : Stores) {
+ // Compute a new pointer with the new base ptr and adjusted offset.
+ MIB.setInstrAndDebugLoc(*SInfo.St);
+ auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
+ auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
+ NewBase, NewOff);
+ if (MIB.getObserver())
+ MIB.getObserver()->changingInstr(*SInfo.St);
+ SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
+ if (MIB.getObserver())
+ MIB.getObserver()->changedInstr(*SInfo.St);
+ }
+ LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
+ << " stores into a base pointer and offsets.\n");
+ return true;
+}
+
+static cl::opt<bool>
+ EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable consecutive memop optimization "
+ "in AArch64PostLegalizerCombiner"));
+
+bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
+ MachineFunction &MF, CSEMIRBuilder &MIB) {
+ // This combine needs to run after all reassociations/folds on pointer
+ // addressing have been done, specifically those that combine two G_PTR_ADDs
+ // with constant offsets into a single G_PTR_ADD with a combined offset.
+ // The goal of this optimization is to undo that combine in the case where
+ // doing so has prevented the formation of pair stores due to illegal
+ // addressing modes of STP. The reason that we do it here is because
+ // it's much easier to undo the transformation on a series of consecutive
+ // mem ops than it is to detect when doing it would be a bad idea by looking
+ // at a single G_PTR_ADD in the reassociation/ptradd_immed_chain combine.
+ //
+ // An example:
+ // G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
+ // %off1:_(s64) = G_CONSTANT i64 4128
+ // %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
+ // G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
+ // %off2:_(s64) = G_CONSTANT i64 4144
+ // %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
+ // G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
+ // %off3:_(s64) = G_CONSTANT i64 4160
+ // %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
+ // G_STORE %11:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)
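+ //
+ // After this combine, the later stores are rewritten to address relative
+ // to the first store's pointer, roughly (value names here are invented
+ // for illustration):
+ // %c16:_(s64) = G_CONSTANT i64 16
+ // %new2:_(p0) = G_PTR_ADD %p1:_, %c16:_(s64)
+ // G_STORE %11:_(<2 x s64>), %new2:_(p0) :: (store (<2 x s64>), align 1)
+ // which leaves residual offsets small enough for STP formation to pair
+ // the stores.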
+ bool Changed = false;
+ auto &MRI = MF.getRegInfo();
+
+ if (!EnableConsecutiveMemOpOpt)
+ return Changed;
+
+ SmallVector<StoreInfo, 8> Stores;
+ // If we see a load, then we keep track of any values defined by it.
+ // In the following example, STP formation will fail anyway because
+ // the latter store is using a load result that appears after the
+ // prior store. In this situation, if we factor out the offset, then
+ // we increase code size for no benefit.
+ SmallVector<Register> LoadValsSinceLastStore;
+
+ auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
+ // Check if this store is consecutive to the last one.
+ if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
+ (Last.Offset + static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
+ New.Offset) ||
+ Last.StoredType != New.StoredType)
+ return false;
+
+ // Check if this store is using a load result that appears after the
+ // last store. If so, bail out.
+ if (llvm::any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
+ return New.St->getValueReg() == LoadVal;
+ }))
+ return false;
+
+ // Check if the current offset would be too large for STP.
+ // If not, then STP formation should be able to handle it, so we don't
+ // need to do anything.
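+ // These limits correspond to STP's signed, scaled 7-bit immediate: a
+ // maximum positive offset of 63 * the access size in bytes.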
+ int64_t MaxLegalOffset;
+ switch (New.StoredType.getSizeInBits()) {
+ case 32:
+ MaxLegalOffset = 252;
+ break;
+ case 64:
+ MaxLegalOffset = 504;
+ break;
+ case 128:
+ MaxLegalOffset = 1008;
+ break;
+ default:
+ llvm_unreachable("Unexpected stored type size");
+ }
+ if (New.Offset < MaxLegalOffset)
+ return false;
+
+ // If factoring it out still wouldn't help then don't bother.
+ return New.Offset - Stores[0].Offset <= MaxLegalOffset;
+ };
+
+ for (auto &MBB : MF) {
+ // We're looking inside a single BB at a time since the memset pattern
+ // should only be in a single block.
+
+ Stores.clear();
+ LoadValsSinceLastStore.clear();
+
+ auto resetState = [&]() {
+ Stores.clear();
+ LoadValsSinceLastStore.clear();
+ };
+
+ for (auto &MI : MBB) {
+ if (auto *St = dyn_cast<GStore>(&MI)) {
+ Register PtrBaseReg;
+ APInt Offset;
+ LLT StoredValTy = MRI.getType(St->getValueReg());
+ unsigned ValSize = StoredValTy.getSizeInBits();
+ if (ValSize != St->getMMO().getSizeInBits())
+ continue; // Don't handle truncating stores.
+ if (ValSize < 32)
+ continue; // Can only STP 32b or larger.
----------------
Pierre-vh wrote:
Nit: I'd merge those together, and do the ValSize < 32 check first because it's the cheaper of the two.
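For example (untested sketch, reusing the locals from the patch):

  // Can only STP 32b or larger; don't handle truncating stores.
  if (ValSize < 32 || ValSize != St->getMMO().getSizeInBits())
    continue;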
https://github.com/llvm/llvm-project/pull/66980