[llvm] d081962 - Merge memtag instructions with adjacent stack slots.
Author: Evgenii Stepanov
Date: 2020-01-17T15:19:29-08:00
New Revision: d081962dead08ef0982081e78d679dd609947ca5
URL: https://github.com/llvm/llvm-project/commit/d081962dead08ef0982081e78d679dd609947ca5
DIFF: https://github.com/llvm/llvm-project/commit/d081962dead08ef0982081e78d679dd609947ca5.diff
LOG: Merge memtag instructions with adjacent stack slots.
Summary:
Detect a run of memory tagging instructions for adjacent stack frame slots,
and replace them with a shorter instruction sequence:
* replace STG + STG with ST2G
* replace STGloop + STGloop with STGloop
This code needs to run after stack slot offsets are already known, but before
FrameIndex operands in STG instructions are eliminated; that is the reason for
the new TargetFrameLowering::processFunctionBeforeFrameIndicesReplaced hook,
called from PrologEpilogInserter.
This change modifies the STGloop and STZGloop pseudos to take the size as an
immediate integer operand, and adds untied variants that are allowed to take
the base address as a FI operand; these keep the STGloop / STZGloop names,
while the original tied, writeback forms become STGloop_wback / STZGloop_wback.
This is needed to simplify recognizing an STGloop instruction as operating on a
stack slot post-regalloc.
This improves memtag code size by ~0.25%, and it looks like an additional ~0.1%
is possible by rearranging the stack frame such that consecutive STG
instructions reference adjacent slots (patch pending).
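
As a rough illustration (a hand-written sketch, not taken from the patch;
register choices and offsets depend on the final frame layout), here is what
the merge plus the epilogue SP-bump folding does for two adjacent 16-byte
tagged slots, matching the stg16_16 test added below:

  // before (sketch): each slot tagged with its own STG, separate SP bump
  stg   sp, [sp, #16]
  stg   sp, [sp]
  add   sp, sp, #32
  ret

  // after: both stores merged into one post-indexed ST2G that also folds
  // the final SP adjustment
  st2g  sp, [sp], #32
  ret

The new cl::opt added in AArch64FrameLowering.cpp ("stack-tagging-merge-settag",
default true) should allow comparing the two forms, e.g. with
llc -mtriple=aarch64 -mattr=+mte -stack-tagging-merge-settag=0.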
Reviewers: pcc, ostannard
Subscribers: hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D70286
Added:
llvm/test/CodeGen/AArch64/settag-merge.ll
llvm/test/CodeGen/AArch64/settag-merge.mir
Modified:
llvm/include/llvm/CodeGen/TargetFrameLowering.h
llvm/lib/CodeGen/PrologEpilogInserter.cpp
llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
llvm/lib/Target/AArch64/AArch64FrameLowering.h
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
llvm/test/CodeGen/AArch64/settag.ll
llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index c7d4c4d7e5d4..a0beee36c748 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -309,6 +309,13 @@ class TargetFrameLowering {
RegScavenger *RS = nullptr) const {
}
+ /// processFunctionBeforeFrameIndicesReplaced - This method is called
+ /// immediately before MO_FrameIndex operands are eliminated, but after the
+ /// frame is finalized. This method is optional.
+ virtual void
+ processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
+ RegScavenger *RS = nullptr) const {}
+
virtual unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const {
report_fatal_error("WinEH not implemented for this target");
}
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 3909b5717281..d583643ac68f 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -259,6 +259,10 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
for (auto &I : EntryDbgValues)
I.first->insert(I.first->begin(), I.second.begin(), I.second.end());
+ // Allow the target machine to make final modifications to the function
+ // before the frame indices are replaced.
+ TFI->processFunctionBeforeFrameIndicesReplaced(MF, RS);
+
// Replace all MO_FrameIndex operands with physical register references
// and actual offsets.
//
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 3b8f8a19fe49..6bce30fab078 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -349,22 +349,38 @@ bool AArch64ExpandPseudo::expandSetTagLoop(
MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
- Register SizeReg = MI.getOperand(2).getReg();
- Register AddressReg = MI.getOperand(3).getReg();
+ Register SizeReg = MI.getOperand(0).getReg();
+ Register AddressReg = MI.getOperand(1).getReg();
MachineFunction *MF = MBB.getParent();
- bool ZeroData = MI.getOpcode() == AArch64::STZGloop;
- const unsigned OpCode =
+ bool ZeroData = MI.getOpcode() == AArch64::STZGloop_wback;
+ const unsigned OpCode1 =
+ ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex;
+ const unsigned OpCode2 =
ZeroData ? AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex;
+ unsigned Size = MI.getOperand(2).getImm();
+ assert(Size > 0 && Size % 16 == 0);
+ if (Size % (16 * 2) != 0) {
+ BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg)
+ .addReg(AddressReg)
+ .addReg(AddressReg)
+ .addImm(1);
+ Size -= 16;
+ }
+ MachineBasicBlock::iterator I =
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg)
+ .addImm(Size);
+ expandMOVImm(MBB, I, 64);
+
auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
MF->insert(++MBB.getIterator(), LoopBB);
MF->insert(++LoopBB->getIterator(), DoneBB);
- BuildMI(LoopBB, DL, TII->get(OpCode))
+ BuildMI(LoopBB, DL, TII->get(OpCode2))
.addDef(AddressReg)
.addReg(AddressReg)
.addReg(AddressReg)
@@ -706,9 +722,14 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
+ case AArch64::STGloop_wback:
+ case AArch64::STZGloop_wback:
+ return expandSetTagLoop(MBB, MBBI, NextMBBI);
case AArch64::STGloop:
case AArch64::STZGloop:
- return expandSetTagLoop(MBB, MBBI, NextMBBI);
+ report_fatal_error(
+ "Non-writeback variants of STGloop / STZGloop should not "
+ "survive past PrologEpilogInserter.");
}
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index ea3e800a1ad2..de726b326197 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -170,6 +170,11 @@ static cl::opt<bool>
cl::desc("reverse the CSR restore sequence"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> StackTaggingMergeSetTag(
+ "stack-tagging-merge-settag",
+ cl::desc("merge settag instruction in function epilog"), cl::init(true),
+ cl::Hidden);
+
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
/// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -480,6 +485,39 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
return true;
}
+bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
+ MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
+ if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
+ return false;
+
+ if (MBB.empty())
+ return true;
+
+ // Disable combined SP bump if the last instruction is an MTE tag store. It
+ // is almost always better to merge SP adjustment into those instructions.
+ MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
+ MachineBasicBlock::iterator Begin = MBB.begin();
+ while (LastI != Begin) {
+ --LastI;
+ if (LastI->isTransient())
+ continue;
+ if (!LastI->getFlag(MachineInstr::FrameDestroy))
+ break;
+ }
+ switch (LastI->getOpcode()) {
+ case AArch64::STGloop:
+ case AArch64::STZGloop:
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ return false;
+ default:
+ return true;
+ }
+ llvm_unreachable("unreachable");
+}
+
// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
@@ -1463,7 +1501,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// function.
if (MF.hasEHFunclets())
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
- bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+ bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
// Assume we can't combine the last pop with the sp restore.
if (!CombineSPBump && PrologueSaveSize != 0) {
@@ -2649,9 +2687,399 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
.addImm(0);
}
-/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before
-/// the update. This is easily retrieved as it is exactly the offset that is set
-/// in processFunctionBeforeFrameFinalized.
+namespace {
+struct TagStoreInstr {
+ MachineInstr *MI;
+ int64_t Offset, Size;
+ explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
+ : MI(MI), Offset(Offset), Size(Size) {}
+};
+
+class TagStoreEdit {
+ MachineFunction *MF;
+ MachineBasicBlock *MBB;
+ MachineRegisterInfo *MRI;
+ // Tag store instructions that are being replaced.
+ SmallVector<TagStoreInstr, 8> TagStores;
+ // Combined memref arguments of the above instructions.
+ SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
+
+ // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
+ // FrameRegOffset + Size) with the address tag of SP.
+ Register FrameReg;
+ StackOffset FrameRegOffset;
+ int64_t Size;
+ // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
+ Optional<int64_t> FrameRegUpdate;
+ // MIFlags for any FrameReg updating instructions.
+ unsigned FrameRegUpdateFlags;
+
+ // Use zeroing instruction variants.
+ bool ZeroData;
+ DebugLoc DL;
+
+ void emitUnrolled(MachineBasicBlock::iterator InsertI);
+ void emitLoop(MachineBasicBlock::iterator InsertI);
+
+public:
+ TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
+ : MBB(MBB), ZeroData(ZeroData) {
+ MF = MBB->getParent();
+ MRI = &MF->getRegInfo();
+ }
+ // Add an instruction to be replaced. Instructions must be added in the
+ // ascending order of Offset, and have to be adjacent.
+ void addInstruction(TagStoreInstr I) {
+ assert((TagStores.empty() ||
+ TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
+ "Non-adjacent tag store instructions.");
+ TagStores.push_back(I);
+ }
+ void clear() { TagStores.clear(); }
+ // Emit equivalent code at the given location, and erase the current set of
+ // instructions. May skip if the replacement is not profitable. May invalidate
+ // the input iterator and replace it with a valid one.
+ void emitCode(MachineBasicBlock::iterator &InsertI,
+ const AArch64FrameLowering *TFI, bool IsLast);
+};
+
+void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ const int64_t kMinOffset = -256 * 16;
+ const int64_t kMaxOffset = 255 * 16;
+
+ Register BaseReg = FrameReg;
+ int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes();
+ if (BaseRegOffsetBytes < kMinOffset ||
+ BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
+ Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
+ {BaseRegOffsetBytes, MVT::i8}, TII);
+ BaseReg = ScratchReg;
+ BaseRegOffsetBytes = 0;
+ }
+
+ MachineInstr *LastI = nullptr;
+ while (Size) {
+ int64_t InstrSize = (Size > 16) ? 32 : 16;
+ unsigned Opcode =
+ InstrSize == 16
+ ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
+ : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
+ MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
+ .addReg(AArch64::SP)
+ .addReg(BaseReg)
+ .addImm(BaseRegOffsetBytes / 16)
+ .setMemRefs(CombinedMemRefs);
+ // A store to [BaseReg, #0] should go last for an opportunity to fold the
+ // final SP adjustment in the epilogue.
+ if (BaseRegOffsetBytes == 0)
+ LastI = I;
+ BaseRegOffsetBytes += InstrSize;
+ Size -= InstrSize;
+ }
+
+ if (LastI)
+ MBB->splice(InsertI, MBB, LastI);
+}
+
+void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ Register BaseReg = FrameRegUpdate
+ ? FrameReg
+ : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+
+ emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
+
+ int64_t LoopSize = Size;
+ // If the loop size is not a multiple of 32, split off one 16-byte store at
+ // the end to fold BaseReg update into.
+ if (FrameRegUpdate && *FrameRegUpdate)
+ LoopSize -= LoopSize % 32;
+ MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
+ TII->get(ZeroData ? AArch64::STZGloop_wback
+ : AArch64::STGloop_wback))
+ .addDef(SizeReg)
+ .addDef(BaseReg)
+ .addImm(LoopSize)
+ .addReg(BaseReg)
+ .setMemRefs(CombinedMemRefs);
+ if (FrameRegUpdate)
+ LoopI->setFlags(FrameRegUpdateFlags);
+
+ int64_t ExtraBaseRegUpdate =
+ FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0;
+ if (LoopSize < Size) {
+ assert(FrameRegUpdate);
+ assert(Size - LoopSize == 16);
+ // Tag 16 more bytes at BaseReg and update BaseReg.
+ BuildMI(*MBB, InsertI, DL,
+ TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
+ .addDef(BaseReg)
+ .addReg(BaseReg)
+ .addReg(BaseReg)
+ .addImm(1 + ExtraBaseRegUpdate / 16)
+ .setMemRefs(CombinedMemRefs)
+ .setMIFlags(FrameRegUpdateFlags);
+ } else if (ExtraBaseRegUpdate) {
+ // Update BaseReg.
+ BuildMI(
+ *MBB, InsertI, DL,
+ TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
+ .addDef(BaseReg)
+ .addReg(BaseReg)
+ .addImm(std::abs(ExtraBaseRegUpdate))
+ .addImm(0)
+ .setMIFlags(FrameRegUpdateFlags);
+ }
+}
+
+// Check if *II is a register update that can be merged into STGloop that ends
+// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the
+// end of the loop.
+bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
+ int64_t Size, int64_t *TotalOffset) {
+ MachineInstr &MI = *II;
+ if ((MI.getOpcode() == AArch64::ADDXri ||
+ MI.getOpcode() == AArch64::SUBXri) &&
+ MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
+ unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
+ int64_t Offset = MI.getOperand(2).getImm() << Shift;
+ if (MI.getOpcode() == AArch64::SUBXri)
+ Offset = -Offset;
+ int64_t AbsPostOffset = std::abs(Offset - Size);
+ const int64_t kMaxOffset =
+ 0xFFF; // Max encoding for unshifted ADDXri / SUBXri
+ if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
+ *TotalOffset = Offset;
+ return true;
+ }
+ }
+ return false;
+}
+
+void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
+ SmallVectorImpl<MachineMemOperand *> &MemRefs) {
+ MemRefs.clear();
+ for (auto &TS : TSE) {
+ MachineInstr *MI = TS.MI;
+ // An instruction without memory operands may access anything. Be
+ // conservative and return an empty list.
+ if (MI->memoperands_empty()) {
+ MemRefs.clear();
+ return;
+ }
+ MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
+ }
+}
+
+void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
+ const AArch64FrameLowering *TFI, bool IsLast) {
+ if (TagStores.empty())
+ return;
+ TagStoreInstr &FirstTagStore = TagStores[0];
+ TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
+ Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
+ DL = TagStores[0].MI->getDebugLoc();
+
+ unsigned Reg;
+ FrameRegOffset = TFI->resolveFrameOffsetReference(
+ *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
+ /*PreferFP=*/false, /*ForSimm=*/true);
+ FrameReg = Reg;
+ FrameRegUpdate = None;
+
+ mergeMemRefs(TagStores, CombinedMemRefs);
+
+ LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
+ for (const auto &Instr
+ : TagStores) { dbgs() << " " << *Instr.MI; });
+
+ // Size threshold where a loop becomes shorter than a linear sequence of
+ // tagging instructions.
+ const int kSetTagLoopThreshold = 176;
+ if (Size < kSetTagLoopThreshold) {
+ if (TagStores.size() < 2)
+ return;
+ emitUnrolled(InsertI);
+ } else {
+ MachineInstr *UpdateInstr = nullptr;
+ int64_t TotalOffset;
+ if (IsLast) {
+ // See if we can merge base register update into the STGloop.
+ // This is done in AArch64LoadStoreOptimizer for "normal" stores,
+ // but STGloop is way too unusual for that, and also it only
+ // realistically happens in function epilogue. Also, STGloop is expanded
+ // before that pass.
+ if (InsertI != MBB->end() &&
+ canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size,
+ &TotalOffset)) {
+ UpdateInstr = &*InsertI++;
+ LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
+ << *UpdateInstr);
+ }
+ }
+
+ if (!UpdateInstr && TagStores.size() < 2)
+ return;
+
+ if (UpdateInstr) {
+ FrameRegUpdate = TotalOffset;
+ FrameRegUpdateFlags = UpdateInstr->getFlags();
+ }
+ emitLoop(InsertI);
+ if (UpdateInstr)
+ UpdateInstr->eraseFromParent();
+ }
+
+ for (auto &TS : TagStores)
+ TS.MI->eraseFromParent();
+}
+
+bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
+ int64_t &Size, bool &ZeroData) {
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ unsigned Opcode = MI.getOpcode();
+ ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
+ Opcode == AArch64::STZ2GOffset);
+
+ if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
+ if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
+ return false;
+ if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
+ return false;
+ Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
+ Size = MI.getOperand(2).getImm();
+ return true;
+ }
+
+ if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
+ Size = 16;
+ else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
+ Size = 32;
+ else
+ return false;
+
+ if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
+ return false;
+
+ Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
+ 16 * MI.getOperand(2).getImm();
+ return true;
+}
+
+// Detect a run of memory tagging instructions for adjacent stack frame slots,
+// and replace them with a shorter instruction sequence:
+// * replace STG + STG with ST2G
+// * replace STGloop + STGloop with STGloop
+// This code needs to run when stack slot offsets are already known, but before
+// FrameIndex operands in STG instructions are eliminated.
+MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
+ const AArch64FrameLowering *TFI,
+ RegScavenger *RS) {
+ bool FirstZeroData;
+ int64_t Size, Offset;
+ MachineInstr &MI = *II;
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::iterator NextI = ++II;
+ if (&MI == &MBB->instr_back())
+ return II;
+ if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
+ return II;
+
+ SmallVector<TagStoreInstr, 4> Instrs;
+ Instrs.emplace_back(&MI, Offset, Size);
+
+ constexpr int kScanLimit = 10;
+ int Count = 0;
+ for (MachineBasicBlock::iterator E = MBB->end();
+ NextI != E && Count < kScanLimit; ++NextI) {
+ MachineInstr &MI = *NextI;
+ bool ZeroData;
+ int64_t Size, Offset;
+ // Collect instructions that update memory tags with a FrameIndex operand
+ // and (when applicable) constant size, and whose output registers are dead
+ // (the latter is almost always the case in practice). Since these
+ // instructions effectively have no inputs or outputs, we are free to skip
+ // any non-aliasing instructions in between without tracking used registers.
+ if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
+ if (ZeroData != FirstZeroData)
+ break;
+ Instrs.emplace_back(&MI, Offset, Size);
+ continue;
+ }
+
+ // Only count non-transient, non-tagging instructions toward the scan
+ // limit.
+ if (!MI.isTransient())
+ ++Count;
+
+ // Just in case, stop before the epilogue code starts.
+ if (MI.getFlag(MachineInstr::FrameSetup) ||
+ MI.getFlag(MachineInstr::FrameDestroy))
+ break;
+
+ // Reject anything that may alias the collected instructions.
+ if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
+ break;
+ }
+
+ // New code will be inserted after the last tagging instruction we've found.
+ MachineBasicBlock::iterator InsertI = Instrs.back().MI;
+ InsertI++;
+
+ llvm::stable_sort(Instrs,
+ [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
+ return Left.Offset < Right.Offset;
+ });
+
+ // Make sure that we don't have any overlapping stores.
+ int64_t CurOffset = Instrs[0].Offset;
+ for (auto &Instr : Instrs) {
+ if (CurOffset > Instr.Offset)
+ return NextI;
+ CurOffset = Instr.Offset + Instr.Size;
+ }
+
+ // Find contiguous runs of tagged memory and emit shorter instruction
+ // sequences for them when possible.
+ TagStoreEdit TSE(MBB, FirstZeroData);
+ Optional<int64_t> EndOffset;
+ for (auto &Instr : Instrs) {
+ if (EndOffset && *EndOffset != Instr.Offset) {
+ // Found a gap.
+ TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
+ TSE.clear();
+ }
+
+ TSE.addInstruction(Instr);
+ EndOffset = Instr.Offset + Instr.Size;
+ }
+
+ TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
+
+ return InsertI;
+}
+} // namespace
+
+void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
+ MachineFunction &MF, RegScavenger *RS = nullptr) const {
+ if (StackTaggingMergeSetTag)
+ for (auto &BB : MF)
+ for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
+ II = tryMergeAdjacentSTG(II, this, RS);
+}
+
+/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
+/// before the update. This is easily retrieved as it is exactly the offset
+/// that is set in processFunctionBeforeFrameFinalized.
int AArch64FrameLowering::getFrameIndexReferencePreferSP(
const MachineFunction &MF, int FI, unsigned &FrameReg,
bool IgnoreSPUpdates) const {
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index b5719feb6b15..57a7924fb8f8 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -77,6 +77,10 @@ class AArch64FrameLowering : public TargetFrameLowering {
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
+ void
+ processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
@@ -107,6 +111,8 @@ class AArch64FrameLowering : public TargetFrameLowering {
int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
int &MinCSFrameIndex,
int &MaxCSFrameIndex) const;
+ bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
+ unsigned StackBumpBytes) const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 54f3f7c10132..0ed2a678c4f0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3458,6 +3458,8 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
case AArch64::ST1Fourv1d:
case AArch64::IRG:
case AArch64::IRGstack:
+ case AArch64::STGloop:
+ case AArch64::STZGloop:
return AArch64FrameOffsetCannotUpdate;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d590d4d913ff..5650d9140821 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1538,17 +1538,29 @@ def TAGPstack
// register / expression for the tagged base pointer of the current function.
def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>;
-// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address.
-// $Rn_wback is one past the end of the range.
+// Large STG to be expanded into a loop. $sz is the size, $Rn is start address.
+// $Rn_wback is one past the end of the range. $Rm is the loop counter.
let isCodeGenOnly=1, mayStore=1 in {
+def STGloop_wback
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "$Rn = $Rn_wback, at earlyclobber $Rn_wback, at earlyclobber $Rm" >,
+ Sched<[WriteAdr, WriteST]>;
+
+def STZGloop_wback
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "$Rn = $Rn_wback, at earlyclobber $Rn_wback, at earlyclobber $Rm" >,
+ Sched<[WriteAdr, WriteST]>;
+
+// A variant of the above where $Rn2 is an independent register not tied to the input register $Rn.
+// Its purpose is to allow using a FrameIndex operand as $Rn (which of course cannot be written back).
def STGloop
- : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
- [], "$Rn = $Rn_wback, at earlyclobber $Rn_wback,$Rm = $Rm_wback, at earlyclobber $Rm_wback" >,
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "@earlyclobber $Rn2, at earlyclobber $Rm" >,
Sched<[WriteAdr, WriteST]>;
def STZGloop
- : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
- [], "$Rn = $Rn_wback, at earlyclobber $Rn_wback,$Rm = $Rm_wback, at earlyclobber $Rm_wback" >,
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "@earlyclobber $Rn2, at earlyclobber $Rm" >,
Sched<[WriteAdr, WriteST]>;
}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 14f839cd4f81..cdfbc0f5f691 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -390,6 +390,10 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
if (isFrameOffsetLegal(MI, AArch64::SP, Offset))
return false;
+ // If even offset 0 is illegal, we don't want a virtual base register.
+ if (!isFrameOffsetLegal(MI, AArch64::SP, 0))
+ return false;
+
// The offset likely isn't legal; we want to allocate a virtual base register.
return true;
}
@@ -445,6 +449,27 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
(void)Done;
}
+// Create a scratch register for the frame index elimination in an instruction.
+// This function has special handling of stack tagging loop pseudos, in which
+// case it can also change the instruction opcode (but not the operands).
+static Register
+createScratchRegisterForInstruction(MachineInstr &MI,
+ const AArch64InstrInfo *TII) {
+ // ST*Gloop have a reserved scratch register in operand 1. Use it, and also
+ // replace the instruction with the writeback variant because it will now
+ // satisfy the operand constraints for it.
+ if (MI.getOpcode() == AArch64::STGloop) {
+ MI.setDesc(TII->get(AArch64::STGloop_wback));
+ return MI.getOperand(1).getReg();
+ } else if (MI.getOpcode() == AArch64::STZGloop) {
+ MI.setDesc(TII->get(AArch64::STZGloop_wback));
+ return MI.getOperand(1).getReg();
+ } else {
+ return MI.getMF()->getRegInfo().createVirtualRegister(
+ &AArch64::GPR64RegClass);
+ }
+}
+
void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
@@ -531,8 +556,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above. Handle the rest, providing a register that is
// SP+LargeImm.
- Register ScratchReg =
- MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ Register ScratchReg = createScratchRegisterForInstruction(MI, TII);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
}
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index ba61ed726e84..65c089a1d37f 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -125,21 +125,18 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag(
return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand,
ZeroData);
- if (ObjSize % 32 != 0) {
- SDNode *St1 = DAG.getMachineNode(
- ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl,
- {MVT::i64, MVT::Other},
- {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain});
- DAG.setNodeMemRefs(cast<MachineSDNode>(St1), {BaseMemOperand});
- ObjSize -= 16;
- Addr = SDValue(St1, 0);
- Chain = SDValue(St1, 1);
- }
-
const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other};
- SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain};
- SDNode *St = DAG.getMachineNode(
- ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops);
+
+ unsigned Opcode;
+ if (Addr.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Addr)->getIndex();
+ Addr = DAG.getTargetFrameIndex(FI, MVT::i64);
+ Opcode = ZeroData ? AArch64::STZGloop : AArch64::STGloop;
+ } else {
+ Opcode = ZeroData ? AArch64::STZGloop_wback : AArch64::STGloop_wback;
+ }
+ SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain};
+ SDNode *St = DAG.getMachineNode(Opcode, dl, ResTys, Ops);
DAG.setNodeMemRefs(cast<MachineSDNode>(St), {BaseMemOperand});
return SDValue(St, 2);
diff --git a/llvm/test/CodeGen/AArch64/settag-merge.ll b/llvm/test/CodeGen/AArch64/settag-merge.ll
new file mode 100644
index 000000000000..1bc93a82070f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/settag-merge.ll
@@ -0,0 +1,214 @@
+; RUN: llc < %s -mtriple=aarch64 -mattr=+mte | FileCheck %s
+
+declare void @use(i8* %p)
+declare void @llvm.aarch64.settag(i8* %p, i64 %a)
+declare void @llvm.aarch64.settag.zero(i8* %p, i64 %a)
+
+define void @stg16_16() {
+entry:
+; CHECK-LABEL: stg16_16:
+; CHECK: st2g sp, [sp], #32
+; CHECK: ret
+ %a = alloca i8, i32 16, align 16
+ %b = alloca i8, i32 16, align 16
+ call void @llvm.aarch64.settag(i8* %a, i64 16)
+ call void @llvm.aarch64.settag(i8* %b, i64 16)
+ ret void
+}
+
+define i32 @stg16_16_16_16_ret() {
+entry:
+; CHECK-LABEL: stg16_16_16_16_ret:
+; CHECK: st2g sp, [sp, #32]
+; CHECK: st2g sp, [sp], #64
+; CHECK: mov w0, wzr
+; CHECK: ret
+ %a = alloca i8, i32 16, align 16
+ %b = alloca i8, i32 16, align 16
+ %c = alloca i8, i32 16, align 16
+ %d = alloca i8, i32 16, align 16
+ call void @llvm.aarch64.settag(i8* %a, i64 16)
+ call void @llvm.aarch64.settag(i8* %b, i64 16)
+ call void @llvm.aarch64.settag(i8* %c, i64 16)
+ call void @llvm.aarch64.settag(i8* %d, i64 16)
+ ret i32 0
+}
+
+define void @stg16_16_16_16() {
+entry:
+; CHECK-LABEL: stg16_16_16_16:
+; CHECK: st2g sp, [sp, #32]
+; CHECK: st2g sp, [sp], #64
+; CHECK: ret
+ %a = alloca i8, i32 16, align 16
+ %b = alloca i8, i32 16, align 16
+ %c = alloca i8, i32 16, align 16
+ %d = alloca i8, i32 16, align 16
+ call void @llvm.aarch64.settag(i8* %a, i64 16)
+ call void @llvm.aarch64.settag(i8* %b, i64 16)
+ call void @llvm.aarch64.settag(i8* %c, i64 16)
+ call void @llvm.aarch64.settag(i8* %d, i64 16)
+ ret void
+}
+
+define void @stg128_128_128_128() {
+entry:
+; CHECK-LABEL: stg128_128_128_128:
+; CHECK: mov x8, #512
+; CHECK: st2g sp, [sp], #32
+; CHECK: sub x8, x8, #32
+; CHECK: cbnz x8,
+; CHECK: ret
+ %a = alloca i8, i32 128, align 16
+ %b = alloca i8, i32 128, align 16
+ %c = alloca i8, i32 128, align 16
+ %d = alloca i8, i32 128, align 16
+ call void @llvm.aarch64.settag(i8* %a, i64 128)
+ call void @llvm.aarch64.settag(i8* %b, i64 128)
+ call void @llvm.aarch64.settag(i8* %c, i64 128)
+ call void @llvm.aarch64.settag(i8* %d, i64 128)
+ ret void
+}
+
+define void @stg16_512_16() {
+entry:
+; CHECK-LABEL: stg16_512_16:
+; CHECK: mov x8, #544
+; CHECK: st2g sp, [sp], #32
+; CHECK: sub x8, x8, #32
+; CHECK: cbnz x8,
+; CHECK: ret
+ %a = alloca i8, i32 16, align 16
+ %b = alloca i8, i32 512, align 16
+ %c = alloca i8, i32 16, align 16
+ call void @llvm.aarch64.settag(i8* %a, i64 16)
+ call void @llvm.aarch64.settag(i8* %b, i64 512)
+ call void @llvm.aarch64.settag(i8* %c, i64 16)
+ ret void
+}
+
+define void @stg512_512_512() {
+entry:
+; CHECK-LABEL: stg512_512_512:
+; CHECK: mov x8, #1536
+; CHECK: st2g sp, [sp], #32
+; CHECK: sub x8, x8, #32
+; CHECK: cbnz x8,
+; CHECK: ret
+ %a = alloca i8, i32 512, align 16
+ %b = alloca i8, i32 512, align 16
+ %c = alloca i8, i32 512, align 16
+ call void @llvm.aarch64.settag(i8* %a, i64 512)
+ call void @llvm.aarch64.settag(i8* %b, i64 512)
+ call void @llvm.aarch64.settag(i8* %c, i64 512)
+ ret void
+}
+
+define void @early(i1 %flag) {
+entry:
+; CHECK-LABEL: early:
+; CHECK: tbz w0, #0, [[LABEL:.LBB.*]]
+; CHECK: st2g sp, [sp, #
+; CHECK: st2g sp, [sp, #
+; CHECK: st2g sp, [sp, #
+; CHECK: [[LABEL]]:
+; CHECK: stg sp, [sp, #
+; CHECK: st2g sp, [sp], #
+; CHECK: ret
+ %a = alloca i8, i32 48, align 16
+ %b = alloca i8, i32 48, align 16
+ %c = alloca i8, i32 48, align 16
+ br i1 %flag, label %if.then, label %if.end
+
+if.then:
+ call void @llvm.aarch64.settag(i8* %a, i64 48)
+ call void @llvm.aarch64.settag(i8* %b, i64 48)
+ br label %if.end
+
+if.end:
+ call void @llvm.aarch64.settag(i8* %c, i64 48)
+ ret void
+}
+
+define void @early_128_128(i1 %flag) {
+entry:
+; CHECK-LABEL: early_128_128:
+; CHECK: tbz w0, #0, [[LABEL:.LBB.*]]
+; CHECK: add x9, sp, #
+; CHECK: mov x8, #256
+; CHECK: st2g x9, [x9], #32
+; CHECK: sub x8, x8, #32
+; CHECK: cbnz x8,
+; CHECK: [[LABEL]]:
+; CHECK: stg sp, [sp, #
+; CHECK: st2g sp, [sp], #
+; CHECK: ret
+ %a = alloca i8, i32 128, align 16
+ %b = alloca i8, i32 128, align 16
+ %c = alloca i8, i32 48, align 16
+ br i1 %flag, label %if.then, label %if.end
+
+if.then:
+ call void @llvm.aarch64.settag(i8* %a, i64 128)
+ call void @llvm.aarch64.settag(i8* %b, i64 128)
+ br label %if.end
+
+if.end:
+ call void @llvm.aarch64.settag(i8* %c, i64 48)
+ ret void
+}
+
+define void @early_512_512(i1 %flag) {
+entry:
+; CHECK-LABEL: early_512_512:
+; CHECK: tbz w0, #0, [[LABEL:.LBB.*]]
+; CHECK: add x9, sp, #
+; CHECK: mov x8, #1024
+; CHECK: st2g x9, [x9], #32
+; CHECK: sub x8, x8, #32
+; CHECK: cbnz x8,
+; CHECK: [[LABEL]]:
+; CHECK: stg sp, [sp, #
+; CHECK: st2g sp, [sp], #
+; CHECK: ret
+ %a = alloca i8, i32 512, align 16
+ %b = alloca i8, i32 512, align 16
+ %c = alloca i8, i32 48, align 16
+ br i1 %flag, label %if.then, label %if.end
+
+if.then:
+ call void @llvm.aarch64.settag(i8* %a, i64 512)
+ call void @llvm.aarch64.settag(i8* %b, i64 512)
+ br label %if.end
+
+if.end:
+ call void @llvm.aarch64.settag(i8* %c, i64 48)
+ ret void
+}
+
+; Two loops of size 256; the second loop updates SP.
+define void @stg128_128_gap_128_128() {
+entry:
+; CHECK-LABEL: stg128_128_gap_128_128:
+; CHECK: mov x9, sp
+; CHECK: mov x8, #256
+; CHECK: st2g x9, [x9], #32
+; CHECK: sub x8, x8, #32
+; CHECK: cbnz x8,
+; CHECK: mov x8, #256
+; CHECK: st2g sp, [sp], #32
+; CHECK: sub x8, x8, #32
+; CHECK: cbnz x8,
+; CHECK: ret
+ %a = alloca i8, i32 128, align 16
+ %a2 = alloca i8, i32 128, align 16
+ %b = alloca i8, i32 32, align 16
+ %c = alloca i8, i32 128, align 16
+ %c2 = alloca i8, i32 128, align 16
+ call void @use(i8* %b)
+ call void @llvm.aarch64.settag(i8* %a, i64 128)
+ call void @llvm.aarch64.settag(i8* %a2, i64 128)
+ call void @llvm.aarch64.settag(i8* %c, i64 128)
+ call void @llvm.aarch64.settag(i8* %c2, i64 128)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/settag-merge.mir b/llvm/test/CodeGen/AArch64/settag-merge.mir
new file mode 100644
index 000000000000..dc2a00c7d3d3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/settag-merge.mir
@@ -0,0 +1,83 @@
+# RUN: llc -mtriple=aarch64 -mattr=+mte -run-pass=prologepilog %s -o - | FileCheck %s
+
+--- |
+ declare void @llvm.aarch64.settag(i8* nocapture writeonly, i64) argmemonly nounwind writeonly "target-features"="+mte"
+ define i32 @stg16_16_16_16_ret() "target-features"="+mte" {
+ entry:
+ %a = alloca i8, i32 16, align 16
+ %b = alloca i8, i32 16, align 16
+ %c = alloca i8, i32 16, align 16
+ %d = alloca i8, i32 16, align 16
+ call void @llvm.aarch64.settag(i8* %a, i64 16)
+ call void @llvm.aarch64.settag(i8* %b, i64 16)
+ call void @llvm.aarch64.settag(i8* %c, i64 16)
+ call void @llvm.aarch64.settag(i8* %d, i64 16)
+ ret i32 0
+ }
+
+ define void @stg16_store_128() "target-features"="+mte" {
+ entry:
+ %a = alloca i8, i32 16, align 16
+ %b = alloca i8, i32 128, align 16
+ call void @llvm.aarch64.settag(i8* %a, i64 16)
+ store i8 42, i8* %a
+ call void @llvm.aarch64.settag(i8* %b, i64 128)
+ ret void
+ }
+
+...
+---
+# A sequence of STG with a register copy in the middle.
+# Can be merged into ST2G + ST2G.
+# CHECK-LABEL: name:{{.*}}stg16_16_16_16_ret
+# CHECK-DAG: ST2GOffset $sp, $sp, 2
+# CHECK-DAG: ST2GOffset $sp, $sp, 0
+# CHECK-DAG: $w0 = COPY $wzr
+# CHECK-DAG: RET_ReallyLR implicit killed $w0
+
+name: stg16_16_16_16_ret
+tracksRegLiveness: true
+stack:
+ - { id: 0, name: a, size: 16, alignment: 16 }
+ - { id: 1, name: b, size: 16, alignment: 16 }
+ - { id: 2, name: c, size: 16, alignment: 16 }
+ - { id: 3, name: d, size: 16, alignment: 16 }
+body: |
+ bb.0.entry:
+ STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a)
+ STGOffset $sp, %stack.1.b, 0 :: (store 16 into %ir.b)
+ STGOffset $sp, %stack.2.c, 0 :: (store 16 into %ir.c)
+ $w0 = COPY $wzr
+ STGOffset $sp, %stack.3.d, 0 :: (store 16 into %ir.d)
+ RET_ReallyLR implicit killed $w0
+
+...
+
+---
+# A store in the middle prevents merging.
+# CHECK-LABEL: name:{{.*}}stg16_store_128
+# CHECK: ST2GOffset $sp, $sp, 2
+# CHECK: ST2GOffset $sp, $sp, 4
+# CHECK: ST2GOffset $sp, $sp, 6
+# CHECK: STGOffset $sp, $sp, 8
+# CHECK: STRBBui
+# CHECK: ST2GOffset $sp, $sp, 0
+# CHECK: RET_ReallyLR
+
+name: stg16_store_128
+tracksRegLiveness: true
+stack:
+ - { id: 0, name: a, size: 16, alignment: 16 }
+ - { id: 1, name: b, size: 128, alignment: 16 }
+body: |
+ bb.0.entry:
+ STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a)
+ renamable $w8 = MOVi32imm 42
+ ST2GOffset $sp, %stack.1.b, 6 :: (store 32 into %ir.b + 96, align 16)
+ ST2GOffset $sp, %stack.1.b, 4 :: (store 32 into %ir.b + 64, align 16)
+ ST2GOffset $sp, %stack.1.b, 2 :: (store 32 into %ir.b + 32, align 16)
+ STRBBui killed renamable $w8, %stack.0.a, 0 :: (store 1 into %ir.a, align 16)
+ ST2GOffset $sp, %stack.1.b, 0 :: (store 32 into %ir.b, align 16)
+ RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/settag.ll b/llvm/test/CodeGen/AArch64/settag.ll
index 9ca188fbce32..3deeb0155fe8 100644
--- a/llvm/test/CodeGen/AArch64/settag.ll
+++ b/llvm/test/CodeGen/AArch64/settag.ll
@@ -64,8 +64,8 @@ entry:
define void @stg17(i8* %p) {
entry:
; CHECK-LABEL: stg17:
-; CHECK: mov {{(w|x)}}[[R:[0-9]+]], #256
; CHECK: stg x0, [x0], #16
+; CHECK: mov {{(w|x)}}[[R:[0-9]+]], #256
; CHECK: st2g x0, [x0], #32
; CHECK: sub x[[R]], x[[R]], #32
; CHECK: cbnz x[[R]],
@@ -87,8 +87,8 @@ entry:
define void @stzg17(i8* %p) {
entry:
; CHECK-LABEL: stzg17:
-; CHECK: mov {{w|x}}[[R:[0-9]+]], #256
; CHECK: stzg x0, [x0], #16
+; CHECK: mov {{w|x}}[[R:[0-9]+]], #256
; CHECK: stz2g x0, [x0], #32
; CHECK: sub x[[R]], x[[R]], #32
; CHECK: cbnz x[[R]],
@@ -110,10 +110,10 @@ entry:
define void @stg_alloca5() {
entry:
; CHECK-LABEL: stg_alloca5:
-; CHECK: stg sp, [sp, #64]
-; CHECK: st2g sp, [sp, #32]
-; CHECK: st2g sp, [sp]
-; CHECK: ret
+; CHECK: st2g sp, [sp, #32]
+; CHECK-NEXT: stg sp, [sp, #64]
+; CHECK-NEXT: st2g sp, [sp], #80
+; CHECK-NEXT: ret
%a = alloca i8, i32 80, align 16
call void @llvm.aarch64.settag(i8* %a, i64 80)
ret void
@@ -122,12 +122,11 @@ entry:
define void @stg_alloca17() {
entry:
; CHECK-LABEL: stg_alloca17:
-; CHECK: mov [[P:x[0-9]+]], sp
-; CHECK: stg [[P]], {{\[}}[[P]]{{\]}}, #16
; CHECK: mov {{w|x}}[[R:[0-9]+]], #256
-; CHECK: st2g [[P]], {{\[}}[[P]]{{\]}}, #32
+; CHECK: st2g sp, [sp], #32
; CHECK: sub x[[R]], x[[R]], #32
; CHECK: cbnz x[[R]],
+; CHECK: stg sp, [sp], #16
; CHECK: ret
%a = alloca i8, i32 272, align 16
call void @llvm.aarch64.settag(i8* %a, i64 272)
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll
index 200837dabfe0..ed6ccc8b4941 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll
@@ -210,11 +210,10 @@ entry:
; DEFAULT: ldrb [[A:w.*]], [x{{.*}}]
; DEFAULT: ldrb [[B:w.*]], [x{{.*}}]
-; ALWAYS: ldg [[PA:x.*]], [x{{.*}}]
-; ALWAYS: ldrb [[B:w.*]], [sp]
-; ALWAYS: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}}
+; ALWAYS-DAG: ldg [[PA:x.*]], [x{{.*}}]
+; ALWAYS-DAG: ldrb [[B:w.*]], [sp]
+; ALWAYS-DAG: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}}
-; COMMON: add w0, [[B]], [[A]]
; COMMON: ret
; One of these allocas is closer to FP than to SP, and within 256 bytes