[llvm] [AArch64][GlobalISel] Split offsets of consecutive stores to aid STP … (PR #66980)

Amara Emerson via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 20 23:12:06 PDT 2023


https://github.com/aemerson created https://github.com/llvm/llvm-project/pull/66980

…formation.

The goal of this optimization is to undo reassociation/offset folding in the case where doing so has prevented the formation of store pairs, because the combined offset is illegal for STP's addressing mode.

Specifically, this can happen because reassociation assumes that folding two offsets into a single larger store offset doesn't break the addressing mode. While that can be true for a lone store, if the store is part of a consecutive sequence that could be merged with others into STPs, the fold can result in worse codegen: STP has a smaller immediate range than a regular STR, so the pairs can no longer be formed.
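
For context on the ranges involved (a standalone sketch of my own, not part of the patch): STR takes an unsigned 12-bit immediate scaled by the access size, while STP takes a signed 7-bit scaled immediate, so the largest positive STP offset is 63 * (size in bytes). These are the limits the combine checks against:
```
// Sketch: maximum positive STP offsets per stored-value size, derived from
// STP's signed 7-bit scaled immediate (range [-64, 63] before scaling).
#include <cstdint>

constexpr int64_t maxSTPOffset(unsigned StoredBits) {
  return 63 * (StoredBits / 8);
}

static_assert(maxSTPOffset(32) == 252, "w-register pair");
static_assert(maxSTPOffset(64) == 504, "x-register pair");
static_assert(maxSTPOffset(128) == 1008, "q-register pair");

// By contrast, a 64-bit STR's unsigned 12-bit scaled immediate reaches
// 4095 * 8 = 32760, which is why offsets like #8000 are fine for STR.
```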

This optimization runs after all reassociations have run, and does a fast linear scan of the function looking for opportunities to factor a common constant out of the offsets of STP-candidate stores.
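
To make the profitability condition concrete: splitting N consecutive stores should yield N/2 STPs, a leftover STR if N is odd, one ADD to set up the new base, and an extra MOV if the base offset doesn't fit an ADD immediate. A small model of that count (my own sketch, mirroring the check in the patch; the patch itself consults TargetLowering::isLegalAddImmediate):
```
// Sketch of the expected instruction count after splitting N consecutive
// stores (not the patch's code, which consults isLegalAddImmediate).
#include <cassert>

static int expectedSavings(int NumStores, bool OffsetFitsAddImm) {
  int Pairs = NumStores / 2;               // STPs formed from the stores
  int Insts = Pairs + (NumStores % 2) + 1; // leftover STR + base ADD
  if (!OffsetFitsAddImm)
    Insts += 1;                            // extra MOV to build the offset
  return NumStores - Insts;
}

int main() {
  assert(expectedSavings(6, false) == 1); // 3 STP + MOV + ADD vs. 6 STR
  assert(expectedSavings(3, true) == 0);  // break-even: don't split
  assert(expectedSavings(4, true) == 1);  // ADD + 2 STP vs. 4 STR
  assert(expectedSavings(4, false) == 0); // break-even: don't split
}
```
For example, the 6-store case at offset 8000 nets one instruction (5 vs. 6), while 3 stores at an ADD-encodable offset only break even, so the combine leaves them alone.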

Code size improvements on CTMark at -Os:
```
Program                                       size.__text
                                              before         after           diff
SPASS/SPASS                                   410616.00      410616.00       0.0%
kimwitu++/kc                                  453636.00      453636.00       0.0%
sqlite3/sqlite3                               287832.00      287832.00       0.0%
tramp3d-v4/tramp3d-v4                         393808.00      393808.00       0.0%
lencod/lencod                                 428060.00      428052.00      -0.0%
7zip/7zip-benchmark                           593512.00      593476.00      -0.0%
consumer-typeset/consumer-typeset             412220.00      412180.00      -0.0%
Bullet/bullet                                 461144.00      460940.00      -0.0%
ClamAV/clamscan                               381964.00      381624.00      -0.1%
mafft/pairlocalalign                          244284.00      243776.00      -0.2%
                           Geomean difference                               -0.0%
```

From b72168631e05eba1847b32b929882dc878d245ff Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Wed, 20 Sep 2023 00:33:17 -0700
Subject: [PATCH] [AArch64][GlobalISel] Split offsets of consecutive stores to
 aid STP formation.

The goal of this optimization is to undo reassociation/offset folding in the
case where doing so has prevented the formation of store pairs, because the
combined offset is illegal for STP's addressing mode.

Specifically, this can happen because reassociation assumes that folding two
offsets into a single larger store offset doesn't break the addressing mode.
While that can be true for a lone store, if the store is part of a consecutive
sequence that could be merged with others into STPs, the fold can result in
worse codegen: STP has a smaller immediate range than a regular STR, so the
pairs can no longer be formed.

This optimization runs after all reassociations have run, and does a fast
linear scan of the function looking for opportunities to factor a common
constant out of the offsets of STP-candidate stores.

Code size improvements on CTMark at -Os:
Program                                       size.__text
                                              before         after           diff
SPASS/SPASS                                   410616.00      410616.00       0.0%
kimwitu++/kc                                  453636.00      453636.00       0.0%
sqlite3/sqlite3                               287832.00      287832.00       0.0%
tramp3d-v4/tramp3d-v4                         393808.00      393808.00       0.0%
lencod/lencod                                 428060.00      428052.00      -0.0%
7zip/7zip-benchmark                           593512.00      593476.00      -0.0%
consumer-typeset/consumer-typeset             412220.00      412180.00      -0.0%
Bullet/bullet                                 461144.00      460940.00      -0.0%
ClamAV/clamscan                               381964.00      381624.00      -0.1%
mafft/pairlocalalign                          244284.00      243776.00      -0.2%
                           Geomean difference                               -0.0%
---
 .../CodeGen/GlobalISel/MachineIRBuilder.h     |   2 +
 .../GISel/AArch64PostLegalizerCombiner.cpp    | 202 +++++++++-
 .../GlobalISel/split-offsets-for-stp.ll       | 353 ++++++++++++++++++
 3 files changed, 556 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index e7db9547f03b694..e8e61b73f9e0c43 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -364,6 +364,8 @@ class MachineIRBuilder {
     State.Observer = &Observer;
   }
 
+  GISelChangeObserver *getObserver() { return State.Observer; }
+
   void stopObservingChanges() { State.Observer = nullptr; }
 
   bool isObservingChanges() const { return State.Observer != nullptr; }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index 56e564638cdcafd..3aae945a06dbc41 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -20,7 +20,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -439,6 +441,19 @@ class AArch64PostLegalizerCombiner : public MachineFunctionPass {
 private:
   bool IsOptNone;
   AArch64PostLegalizerCombinerImplRuleConfig RuleConfig;
+
+
+  struct StoreInfo {
+    GStore *St;
+    GPtrAdd *Ptr;
+    int64_t Offset;
+    LLT StoredType;
+  };
+  bool tryOptimizeConsecStores(SmallVectorImpl<StoreInfo> &Stores,
+                               CSEMIRBuilder &MIB);
+
+  bool optimizeConsecutiveMemOpAddressing(MachineFunction &MF,
+                                          CSEMIRBuilder &MIB);
 };
 } // end anonymous namespace
 
@@ -492,7 +507,192 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
                      F.hasMinSize());
   AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo,
                                         RuleConfig, ST, MDT, LI);
-  return Impl.combineMachineInstrs();
+  bool Changed = Impl.combineMachineInstrs();
+
+  auto MIB = CSEMIRBuilder(MF);
+  MIB.setCSEInfo(CSEInfo);
+  Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
+  return Changed;
+}
+
+bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
+    SmallVectorImpl<StoreInfo> &Stores, CSEMIRBuilder &MIB) {
+  if (Stores.size() <= 2)
+    return false;
+
+  // Profitability checks:
+  int64_t BaseOffset = Stores[0].Offset;
+  unsigned NumPairsExpected = Stores.size() / 2;
+  // Splitting always needs one ADD to materialize the new base pointer,
+  // plus a MOV if the offset can't be encoded as an ADD immediate.
+  unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2) + 1;
+  auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
+  if (!TLI.isLegalAddImmediate(BaseOffset))
+    TotalInstsExpected++;
+  int SavingsExpected = Stores.size() - TotalInstsExpected;
+  if (SavingsExpected <= 0)
+    return false;
+
+  auto &MRI = MIB.getMF().getRegInfo();
+
+  // We have a series of consecutive stores. Factor out the common base
+  // pointer and rewrite the offsets.
+  Register NewBase = Stores[0].Ptr->getReg(0);
+  for (auto &SInfo : Stores) {
+    // Compute a new pointer with the new base ptr and adjusted offset.
+    MIB.setInstrAndDebugLoc(*SInfo.St);
+    auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
+    auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
+                                  NewBase, NewOff);
+    if (MIB.getObserver())
+      MIB.getObserver()->changingInstr(*SInfo.St);
+    SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
+    if (MIB.getObserver())
+      MIB.getObserver()->changedInstr(*SInfo.St);
+  }
+  LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
+                    << " stores into a base pointer and offsets.\n");
+  return true;
+}
+
+static cl::opt<bool>
+    EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
+                              cl::init(true), cl::Hidden,
+                              cl::desc("Enable consecutive memop optimization "
+                                       "in AArch64PostLegalizerCombiner"));
+
+bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
+    MachineFunction &MF, CSEMIRBuilder &MIB) {
+  // This combine needs to run after all reassociations/folds on pointer
+  // addressing have been done, specifically those that combine two G_PTR_ADDs
+  // with constant offsets into a single G_PTR_ADD with a combined offset.
+  // The goal of this optimization is to undo that combine in the case where
+  // doing so has prevented the formation of pair stores due to illegal
+  // addressing modes of STP. The reason we do it here is that it's much
+  // easier to undo the transformation on a series of consecutive mem ops
+  // than it is to detect, from a single G_PTR_ADD in the
+  // reassociation/ptradd_immed_chain combine, that doing it is a bad idea.
+  //
+  // An example:
+  //   G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
+  //   %off1:_(s64) = G_CONSTANT i64 4128
+  //   %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
+  //   G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
+  //   %off2:_(s64) = G_CONSTANT i64 4144
+  //   %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
+  //   G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
+  //   %off3:_(s64) = G_CONSTANT i64 4160
+  //   %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
+  //   G_STORE %11:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)
+  bool Changed = false;
+  auto &MRI = MF.getRegInfo();
+
+  if (!EnableConsecutiveMemOpOpt)
+    return Changed;
+
+  SmallVector<StoreInfo, 8> Stores;
+  // If we see a load, then we keep track of any values defined by it.
+  // In the following example, STP formation will fail anyway because
+  // the latter store is using a load result that appears after the
+  // prior store. In this situation, if we factor out the offset then
+  // we increase code size for no benefit.
+  SmallVector<Register> LoadValsSinceLastStore;
+
+  auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
+    // Check if this store is consecutive to the last one.
+    if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
+        (Last.Offset + static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
+         New.Offset) ||
+        Last.StoredType != New.StoredType)
+      return false;
+
+    // Check if this store is using a load result that appears after the
+    // last store. If so, bail out.
+    if (llvm::any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
+          return New.St->getValueReg() == LoadVal;
+        }))
+      return false;
+
+    // Check if the current offset would be too large for STP.
+    // If not, then STP formation should be able to handle it, so we don't
+    // need to do anything.
+    int64_t MaxLegalOffset;
+    switch (New.StoredType.getSizeInBits()) {
+    case 32:
+      MaxLegalOffset = 252;
+      break;
+    case 64:
+      MaxLegalOffset = 504;
+      break;
+    case 128:
+      MaxLegalOffset = 1008;
+      break;
+    default:
+      llvm_unreachable("Unexpected stored type size");
+    }
+    if (New.Offset < MaxLegalOffset)
+      return false;
+
+    // If factoring it out still wouldn't help then don't bother.
+    return New.Offset - Stores[0].Offset <= MaxLegalOffset;
+  };
+
+  for (auto &MBB : MF) {
+    // We're looking inside a single BB at a time since the memset pattern
+    // should only be in a single block.
+
+    Stores.clear();
+    LoadValsSinceLastStore.clear();
+
+    auto resetState = [&]() {
+      Stores.clear();
+      LoadValsSinceLastStore.clear();
+    };
+
+    for (auto &MI : MBB) {
+      if (auto *St = dyn_cast<GStore>(&MI)) {
+        Register PtrBaseReg;
+        APInt Offset;
+        LLT StoredValTy = MRI.getType(St->getValueReg());
+        unsigned ValSize = StoredValTy.getSizeInBits();
+        if (ValSize != St->getMMO().getSizeInBits())
+          continue; // Don't handle truncating stores.
+        if (ValSize < 32)
+          continue; // Can only STP 32b or larger.
+
+        if (mi_match(
+                St->getPointerReg(), MRI,
+                m_OneNonDBGUse(m_GPtrAdd(m_Reg(PtrBaseReg), m_ICst(Offset))))) {
+          GPtrAdd *PtrAdd = cast<GPtrAdd>(MRI.getVRegDef(St->getPointerReg()));
+          StoreInfo New = {St, PtrAdd, Offset.getSExtValue(), StoredValTy};
+
+          if (Stores.empty()) {
+            Stores.push_back(New);
+            continue;
+          }
+
+          // Check if this store is consecutive to the last one.
+          auto &Last = Stores.back();
+          if (storeIsValid(Last, New)) {
+            Stores.push_back(New);
+            LoadValsSinceLastStore.clear(); // Reset the load value tracking.
+          } else {
+            // The store isn't valid to consider as part of the prior sequence,
+            // so try to optimize what we have so far and start a new sequence.
+            Changed |= tryOptimizeConsecStores(Stores, MIB);
+            resetState();
+            Stores.push_back(New);
+          }
+        }
+      } else if (auto *Ld = dyn_cast<GLoad>(&MI)) {
+        LoadValsSinceLastStore.push_back(Ld->getDstReg());
+      }
+    }
+    Changed |= tryOptimizeConsecStores(Stores, MIB);
+    resetState();
+  }
+
+  return Changed;
 }
 
 char AArch64PostLegalizerCombiner::ID = 0;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll
new file mode 100644
index 000000000000000..c38ead59b508d03
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-offsets-for-stp.ll
@@ -0,0 +1,353 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64-apple-ios -verify-machineinstrs -global-isel -aarch64-postlegalizer-consecutive-memops=0 < %s | FileCheck %s --check-prefix=CHECK-NO-SPLIT
+; RUN: llc -mtriple=aarch64-apple-ios -verify-machineinstrs -global-isel < %s | FileCheck %s --check-prefix=CHECK-SPLIT
+
+define void @basic_split(ptr %p) {
+; CHECK-NO-SPLIT-LABEL: basic_split:
+; CHECK-NO-SPLIT:       ; %bb.0:
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8008]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8016]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8024]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8032]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8040]
+; CHECK-NO-SPLIT-NEXT:    ret
+;
+; CHECK-SPLIT-LABEL: basic_split:
+; CHECK-SPLIT:       ; %bb.0:
+; CHECK-SPLIT-NEXT:    mov w8, #8000 ; =0x1f40
+; CHECK-SPLIT-NEXT:    add x8, x0, x8
+; CHECK-SPLIT-NEXT:    stp xzr, xzr, [x8]
+; CHECK-SPLIT-NEXT:    stp xzr, xzr, [x8, #16]
+; CHECK-SPLIT-NEXT:    stp xzr, xzr, [x8, #32]
+; CHECK-SPLIT-NEXT:    ret
+  %bigoffset = getelementptr i64, ptr %p, i64 1000
+  store i64 0, ptr %bigoffset
+  %addr2 = getelementptr i64, ptr %p, i64 1001
+  store i64 0, ptr %addr2
+  %addr3 = getelementptr i64, ptr %p, i64 1002
+  store i64 0, ptr %addr3
+  %addr4 = getelementptr i64, ptr %p, i64 1003
+  store i64 0, ptr %addr4
+  %addr5 = getelementptr i64, ptr %p, i64 1004
+  store i64 0, ptr %addr5
+  %addr6 = getelementptr i64, ptr %p, i64 1005
+  store i64 0, ptr %addr6
+  ret void
+}
+
+define void @basic_multi_use_ptr(ptr %p, ptr %p2) {
+; CHECK-NO-SPLIT-LABEL: basic_multi_use_ptr:
+; CHECK-NO-SPLIT:       ; %bb.0:
+; CHECK-NO-SPLIT-NEXT:    mov w8, #8008 ; =0x1f48
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-NO-SPLIT-NEXT:    add x8, x0, x8
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8008]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8016]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8024]
+; CHECK-NO-SPLIT-NEXT:    str x8, [x1]
+; CHECK-NO-SPLIT-NEXT:    ret
+;
+; CHECK-SPLIT-LABEL: basic_multi_use_ptr:
+; CHECK-SPLIT:       ; %bb.0:
+; CHECK-SPLIT-NEXT:    mov w8, #8008 ; =0x1f48
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-SPLIT-NEXT:    add x8, x0, x8
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8008]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8016]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8024]
+; CHECK-SPLIT-NEXT:    str x8, [x1]
+; CHECK-SPLIT-NEXT:    ret
+  %bigoffset = getelementptr i64, ptr %p, i64 1000
+  store i64 0, ptr %bigoffset
+  %addr2 = getelementptr i64, ptr %p, i64 1001
+  store i64 0, ptr %addr2
+  %addr3 = getelementptr i64, ptr %p, i64 1002
+  store i64 0, ptr %addr3
+  %addr4 = getelementptr i64, ptr %p, i64 1003
+  store i64 0, ptr %addr4
+  ; multiuse of %addr2
+  store ptr %addr2, ptr %p2
+  ret void
+}
+
+define void @not_consecutive(ptr %p) {
+; CHECK-NO-SPLIT-LABEL: not_consecutive:
+; CHECK-NO-SPLIT:       ; %bb.0:
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8008]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8024]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8032]
+; CHECK-NO-SPLIT-NEXT:    ret
+;
+; CHECK-SPLIT-LABEL: not_consecutive:
+; CHECK-SPLIT:       ; %bb.0:
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8008]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8024]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8032]
+; CHECK-SPLIT-NEXT:    ret
+  %bigoffset = getelementptr i64, ptr %p, i64 1000
+  store i64 0, ptr %bigoffset
+  %addr2 = getelementptr i64, ptr %p, i64 1001
+  store i64 0, ptr %addr2
+  %addr3 = getelementptr i64, ptr %p, i64 1003
+  store i64 0, ptr %addr3
+  %addr4 = getelementptr i64, ptr %p, i64 1004
+  store i64 0, ptr %addr4
+  ret void
+}
+
+define void @early_store_is_invalid_but_split_rest(ptr %p) {
+; CHECK-NO-SPLIT-LABEL: early_store_is_invalid_but_split_rest:
+; CHECK-NO-SPLIT:       ; %bb.0:
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8080]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8016]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8024]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8032]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8040]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8048]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8056]
+; CHECK-NO-SPLIT-NEXT:    ret
+;
+; CHECK-SPLIT-LABEL: early_store_is_invalid_but_split_rest:
+; CHECK-SPLIT:       ; %bb.0:
+; CHECK-SPLIT-NEXT:    mov w8, #8016 ; =0x1f50
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-SPLIT-NEXT:    add x8, x0, x8
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8080]
+; CHECK-SPLIT-NEXT:    stp xzr, xzr, [x8]
+; CHECK-SPLIT-NEXT:    stp xzr, xzr, [x8, #16]
+; CHECK-SPLIT-NEXT:    stp xzr, xzr, [x8, #32]
+; CHECK-SPLIT-NEXT:    ret
+  %bigoffset = getelementptr i64, ptr %p, i64 1000
+  store i64 0, ptr %bigoffset
+  %addr2 = getelementptr i64, ptr %p, i64 1010
+  store i64 0, ptr %addr2
+  %addr3 = getelementptr i64, ptr %p, i64 1002
+  store i64 0, ptr %addr3
+  %addr4 = getelementptr i64, ptr %p, i64 1003
+  store i64 0, ptr %addr4
+  %addr5 = getelementptr i64, ptr %p, i64 1004
+  store i64 0, ptr %addr5
+  %addr6 = getelementptr i64, ptr %p, i64 1005
+  store i64 0, ptr %addr6
+  %addr7 = getelementptr i64, ptr %p, i64 1006
+  store i64 0, ptr %addr7
+  %addr8 = getelementptr i64, ptr %p, i64 1007
+  store i64 0, ptr %addr8
+  ret void
+}
+
+define void @vector(ptr %p) {
+; CHECK-NO-SPLIT-LABEL: vector:
+; CHECK-NO-SPLIT:       ; %bb.0:
+; CHECK-NO-SPLIT-NEXT:    movi.2d v0, #0000000000000000
+; CHECK-NO-SPLIT-NEXT:    str q0, [x0, #16000]
+; CHECK-NO-SPLIT-NEXT:    str q0, [x0, #16016]
+; CHECK-NO-SPLIT-NEXT:    str q0, [x0, #16032]
+; CHECK-NO-SPLIT-NEXT:    str q0, [x0, #16048]
+; CHECK-NO-SPLIT-NEXT:    str q0, [x0, #16064]
+; CHECK-NO-SPLIT-NEXT:    str q0, [x0, #16080]
+; CHECK-NO-SPLIT-NEXT:    str q0, [x0, #16096]
+; CHECK-NO-SPLIT-NEXT:    str q0, [x0, #16112]
+; CHECK-NO-SPLIT-NEXT:    ret
+;
+; CHECK-SPLIT-LABEL: vector:
+; CHECK-SPLIT:       ; %bb.0:
+; CHECK-SPLIT-NEXT:    movi.2d v0, #0000000000000000
+; CHECK-SPLIT-NEXT:    mov w8, #16000 ; =0x3e80
+; CHECK-SPLIT-NEXT:    add x8, x0, x8
+; CHECK-SPLIT-NEXT:    stp q0, q0, [x8]
+; CHECK-SPLIT-NEXT:    stp q0, q0, [x8, #32]
+; CHECK-SPLIT-NEXT:    stp q0, q0, [x8, #64]
+; CHECK-SPLIT-NEXT:    stp q0, q0, [x8, #96]
+; CHECK-SPLIT-NEXT:    ret
+  %bigoffset = getelementptr <2 x i64>, ptr %p, i64 1000
+  store <2 x i64> <i64 0, i64 0>, ptr %bigoffset
+  %addr2 = getelementptr <2 x i64>, ptr %p, i64 1001
+  store <2 x i64> <i64 0, i64 0>, ptr %addr2
+  %addr3 = getelementptr <2 x i64>, ptr %p, i64 1002
+  store <2 x i64> <i64 0, i64 0>, ptr %addr3
+  %addr4 = getelementptr <2 x i64>, ptr %p, i64 1003
+  store <2 x i64> <i64 0, i64 0>, ptr %addr4
+  %addr5 = getelementptr <2 x i64>, ptr %p, i64 1004
+  store <2 x i64> <i64 0, i64 0>, ptr %addr5
+  %addr6 = getelementptr <2 x i64>, ptr %p, i64 1005
+  store <2 x i64> <i64 0, i64 0>, ptr %addr6
+  %addr7 = getelementptr <2 x i64>, ptr %p, i64 1006
+  store <2 x i64> <i64 0, i64 0>, ptr %addr7
+  %addr8 = getelementptr <2 x i64>, ptr %p, i64 1007
+  store <2 x i64> <i64 0, i64 0>, ptr %addr8
+  ret void
+}
+
+define void @can_already_form_stp(ptr %p) {
+; CHECK-NO-SPLIT-LABEL: can_already_form_stp:
+; CHECK-NO-SPLIT:       ; %bb.0:
+; CHECK-NO-SPLIT-NEXT:    stp xzr, xzr, [x0, #80]
+; CHECK-NO-SPLIT-NEXT:    stp xzr, xzr, [x0, #96]
+; CHECK-NO-SPLIT-NEXT:    stp xzr, xzr, [x0, #112]
+; CHECK-NO-SPLIT-NEXT:    ret
+;
+; CHECK-SPLIT-LABEL: can_already_form_stp:
+; CHECK-SPLIT:       ; %bb.0:
+; CHECK-SPLIT-NEXT:    stp xzr, xzr, [x0, #80]
+; CHECK-SPLIT-NEXT:    stp xzr, xzr, [x0, #96]
+; CHECK-SPLIT-NEXT:    stp xzr, xzr, [x0, #112]
+; CHECK-SPLIT-NEXT:    ret
+  %bigoffset = getelementptr i64, ptr %p, i64 10
+  store i64 0, ptr %bigoffset
+  %addr2 = getelementptr i64, ptr %p, i64 11
+  store i64 0, ptr %addr2
+  %addr3 = getelementptr i64, ptr %p, i64 12
+  store i64 0, ptr %addr3
+  %addr4 = getelementptr i64, ptr %p, i64 13
+  store i64 0, ptr %addr4
+  %addr5 = getelementptr i64, ptr %p, i64 14
+  store i64 0, ptr %addr5
+  %addr6 = getelementptr i64, ptr %p, i64 15
+  store i64 0, ptr %addr6
+  ret void
+}
+
+define void @use_of_load_in_between(ptr %p, ptr %ldptr, ptr %ldptr2) {
+; CHECK-NO-SPLIT-LABEL: use_of_load_in_between:
+; CHECK-NO-SPLIT:       ; %bb.0:
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-NO-SPLIT-NEXT:    ldr x8, [x1]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8016]
+; CHECK-NO-SPLIT-NEXT:    str x8, [x0, #8008]
+; CHECK-NO-SPLIT-NEXT:    ldr x8, [x2]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8032]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8040]
+; CHECK-NO-SPLIT-NEXT:    str x8, [x0, #8024]
+; CHECK-NO-SPLIT-NEXT:    ret
+;
+; CHECK-SPLIT-LABEL: use_of_load_in_between:
+; CHECK-SPLIT:       ; %bb.0:
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-SPLIT-NEXT:    ldr x8, [x1]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8016]
+; CHECK-SPLIT-NEXT:    str x8, [x0, #8008]
+; CHECK-SPLIT-NEXT:    ldr x8, [x2]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8032]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8040]
+; CHECK-SPLIT-NEXT:    str x8, [x0, #8024]
+; CHECK-SPLIT-NEXT:    ret
+  %bigoffset = getelementptr i64, ptr %p, i64 1000
+  store i64 0, ptr %bigoffset
+  %addr2 = getelementptr i64, ptr %p, i64 1001
+  %ld = load i64, ptr %ldptr
+  store i64 %ld, ptr %addr2
+  %addr3 = getelementptr i64, ptr %p, i64 1002
+  store i64 0, ptr %addr3
+  %addr4 = getelementptr i64, ptr %p, i64 1003
+  %ld2 = load i64, ptr %ldptr2
+  store i64 %ld2, ptr %addr4
+  %addr5 = getelementptr i64, ptr %p, i64 1004
+  store i64 0, ptr %addr5
+  %addr6 = getelementptr i64, ptr %p, i64 1005
+  store i64 0, ptr %addr6
+  ret void
+}
+
+define void @offset_legal_for_add_imm(ptr %p) {
+; CHECK-NO-SPLIT-LABEL: offset_legal_for_add_imm:
+; CHECK-NO-SPLIT:       ; %bb.0:
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #3200]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #3208]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #3216]
+; CHECK-NO-SPLIT-NEXT:    ret
+;
+; CHECK-SPLIT-LABEL: offset_legal_for_add_imm:
+; CHECK-SPLIT:       ; %bb.0:
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #3200]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #3208]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #3216]
+; CHECK-SPLIT-NEXT:    ret
+  %bigoffset = getelementptr i64, ptr %p, i64 400
+  store i64 0, ptr %bigoffset
+  %addr2 = getelementptr i64, ptr %p, i64 401
+  store i64 0, ptr %addr2
+  %addr3 = getelementptr i64, ptr %p, i64 402
+  store i64 0, ptr %addr3
+  ret void
+}
+
+define void @offset_illegal_for_add_imm(ptr %p) {
+; CHECK-NO-SPLIT-LABEL: offset_illegal_for_add_imm:
+; CHECK-NO-SPLIT:       ; %bb.0:
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8008]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8016]
+; CHECK-NO-SPLIT-NEXT:    ret
+;
+; CHECK-SPLIT-LABEL: offset_illegal_for_add_imm:
+; CHECK-SPLIT:       ; %bb.0:
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8008]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8016]
+; CHECK-SPLIT-NEXT:    ret
+  %bigoffset = getelementptr i64, ptr %p, i64 1000
+  store i64 0, ptr %bigoffset
+  %addr2 = getelementptr i64, ptr %p, i64 1001
+  store i64 0, ptr %addr2
+  %addr3 = getelementptr i64, ptr %p, i64 1002
+  store i64 0, ptr %addr3
+  ret void
+}
+
+define void @offset_legal_for_add_imm_4_stores(ptr %p) {
+; CHECK-NO-SPLIT-LABEL: offset_legal_for_add_imm_4_stores:
+; CHECK-NO-SPLIT:       ; %bb.0:
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #3200]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #3208]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #3216]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #3224]
+; CHECK-NO-SPLIT-NEXT:    ret
+;
+; CHECK-SPLIT-LABEL: offset_legal_for_add_imm_4_stores:
+; CHECK-SPLIT:       ; %bb.0:
+; CHECK-SPLIT-NEXT:    add x8, x0, #3200
+; CHECK-SPLIT-NEXT:    stp xzr, xzr, [x8]
+; CHECK-SPLIT-NEXT:    stp xzr, xzr, [x8, #16]
+; CHECK-SPLIT-NEXT:    ret
+  %bigoffset = getelementptr i64, ptr %p, i64 400
+  store i64 0, ptr %bigoffset
+  %addr2 = getelementptr i64, ptr %p, i64 401
+  store i64 0, ptr %addr2
+  %addr3 = getelementptr i64, ptr %p, i64 402
+  store i64 0, ptr %addr3
+  %addr4 = getelementptr i64, ptr %p, i64 403
+  store i64 0, ptr %addr4
+  ret void
+}
+
+define void @offset_illegal_for_add_imm_4_stores(ptr %p) {
+; CHECK-NO-SPLIT-LABEL: offset_illegal_for_add_imm_4_stores:
+; CHECK-NO-SPLIT:       ; %bb.0:
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8008]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8016]
+; CHECK-NO-SPLIT-NEXT:    str xzr, [x0, #8024]
+; CHECK-NO-SPLIT-NEXT:    ret
+;
+; CHECK-SPLIT-LABEL: offset_illegal_for_add_imm_4_stores:
+; CHECK-SPLIT:       ; %bb.0:
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8000]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8008]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8016]
+; CHECK-SPLIT-NEXT:    str xzr, [x0, #8024]
+; CHECK-SPLIT-NEXT:    ret
+  %bigoffset = getelementptr i64, ptr %p, i64 1000
+  store i64 0, ptr %bigoffset
+  %addr2 = getelementptr i64, ptr %p, i64 1001
+  store i64 0, ptr %addr2
+  %addr3 = getelementptr i64, ptr %p, i64 1002
+  store i64 0, ptr %addr3
+  %addr4 = getelementptr i64, ptr %p, i64 1003
+  store i64 0, ptr %addr4
+  ret void
+}


