[llvm] 2f63e57 - [MTE] Pin the tagged base pointer to one of the stack slots.

Evgenii Stepanov via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 15 13:06:34 PDT 2020


Author: Evgenii Stepanov
Date: 2020-10-15T12:50:16-07:00
New Revision: 2f63e57fa59e7fbfe5999ec1e6e60fa7a2ba70bb

URL: https://github.com/llvm/llvm-project/commit/2f63e57fa59e7fbfe5999ec1e6e60fa7a2ba70bb
DIFF: https://github.com/llvm/llvm-project/commit/2f63e57fa59e7fbfe5999ec1e6e60fa7a2ba70bb.diff

LOG: [MTE] Pin the tagged base pointer to one of the stack slots.

Summary:
Pin the tagged base pointer to one of the stack slots, and (if
necessary) rewrite tag offsets so that an object that occupies that
slot has both address and tag offsets of 0. This allows ADDG
instructions for that object to be eliminated and their uses replaced
with the tagged base pointer itself.

This optimization must be done in machine instructions and not in the IR
instrumentation pass, because referring to a stack slot through an IRG
pointer would confuse the stack coloring pass.

The optimization makes a (pretty naive) attempt to find the slot that
would benefit the most: for each tagged slot it counts the uses of that
slot's tagged address in the function and picks the highest-scoring one.
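
For illustration, a minimal sketch of the effect, distilled from the
small_alloca update in llvm/test/CodeGen/AArch64/irg_sp_tagp.ll below (x8
stands for whatever scratch register the old code happened to pick):

  %a  = alloca i8, align 16
  %q  = call i8* @llvm.aarch64.irg.sp(i64 0)
  %q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)

  ; before this change:
  ;   irg  x8, sp
  ;   addg x0, x8, #0, #1
  ; after (the tag offset of %a is rewritten to 0 and %a is pinned to the
  ; tagged base pointer, so the ADDG disappears):
  ;   irg  x0, sp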

Reviewers: ostannard, pcc

Subscribers: merge_guards_bot, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D72365

Added: 
    

Modified: 
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
    llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
    llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
    llvm/test/CodeGen/AArch64/irg_sp_tagp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 248b4c429d6e..f07e6d6a2999 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -767,6 +767,10 @@ def int_aarch64_irg_sp   : Intrinsic<[llvm_ptr_ty], [llvm_i64_ty],
 // ptr1 = tagp(ptr0, baseptr, tag_offset) returns a pointer where
 // * address is the address in ptr0
 // * tag is a function of (tag in baseptr, tag_offset).
+// ** Beware, this is not the same function as implemented by the ADDG instruction!
+//    Backend optimizations may change tag_offset; the only guarantee is that calls
+//    to tagp with the same pair of (baseptr, tag_offset) will produce pointers
+//    with the same tag value, assuming the set of excluded tags has not changed.
 // Address bits in baseptr and tag bits in ptr0 are ignored.
 // When offset between ptr0 and baseptr is a compile time constant, this can be emitted as
 //   ADDG ptr1, baseptr, (ptr0 - baseptr), tag_offset
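
As a hypothetical illustration of the guarantee described above (an editorial
sketch, not part of this patch; %x and %y stand for arbitrary alloca-derived
pointers):

  %base = call i8* @llvm.aarch64.irg.sp(i64 0)
  ; Both calls use the same (baseptr, tag_offset) pair, so %p and %q are
  ; guaranteed to carry the same tag even if the backend rewrites the offset
  ; internally; their address bits still come from %x and %y respectively.
  %p = call i8* @llvm.aarch64.tagp.p0i8(i8* %x, i8* %base, i64 3)
  %q = call i8* @llvm.aarch64.tagp.p0i8(i8* %y, i8* %base, i64 3)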

diff  --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 0d52b00d54ba..d52267272092 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1070,9 +1070,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
     return;
 
-  // Set tagged base pointer to the bottom of the stack frame.
+  // Set tagged base pointer to the requested stack slot.
   // Ideally it should match SP value after prologue.
-  AFI->setTaggedBasePointerOffset(MFI.getStackSize());
+  Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
+  if (TBPI)
+    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
+  else
+    AFI->setTaggedBasePointerOffset(MFI.getStackSize());
 
   const StackOffset &SVEStackSize = getSVEStackSize(MF);
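
As a worked example of the new computation above (assuming, as is the usual
convention here, that fixed-size locals have negative object offsets measured
down from SP at function entry): if the pinned slot sits at object offset -32,
TaggedBasePointerOffset becomes 32 and the tagged base pointer ends up
pointing exactly at that slot, so its ADDG collapses to address/tag offsets of
0. When no slot has been pinned, the old behaviour is kept and the offset is
the whole stack size, i.e. the bottom of the frame.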
 

diff  --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index cc07b9ae465a..f60e2b6c316e 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -128,10 +128,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// that must be forwarded to every musttail call.
   SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
 
-  // Offset from SP-at-entry to the tagged base pointer.
-  // Tagged base pointer is set up to point to the first (lowest address) tagged
-  // stack slot.
-  unsigned TaggedBasePointerOffset = 0;
+  /// FrameIndex for the tagged base pointer.
+  Optional<int> TaggedBasePointerIndex;
+
+  /// Offset from SP-at-entry to the tagged base pointer.
+  /// Tagged base pointer is set up to point to the first (lowest address)
+  /// tagged stack slot.
+  unsigned TaggedBasePointerOffset = 0;
 
   /// OutliningStyle denotes, if a function was outlined, how it was outlined,
   /// e.g. Tail Call, Thunk, or Function if none apply.
@@ -343,6 +346,11 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
     return ForwardedMustTailRegParms;
   }
 
+  Optional<int> getTaggedBasePointerIndex() const {
+    return TaggedBasePointerIndex;
+  }
+  void setTaggedBasePointerIndex(int Index) { TaggedBasePointerIndex = Index; }
+
   unsigned getTaggedBasePointerOffset() const {
     return TaggedBasePointerOffset;
   }

diff  --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
index 73bd434ef123..41096a961330 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -13,7 +13,6 @@
 #include "AArch64InstrInfo.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -50,6 +49,12 @@ cl::opt<UncheckedLdStMode> ClUncheckedLdSt(
             "apply unchecked-ld-st when the target is definitely within range"),
         clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st")));
 
+static cl::opt<bool>
+    ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true),
+                cl::ZeroOrMore,
+                cl::desc("Apply first slot optimization for stack tagging "
+                         "(eliminate ADDG Rt, Rn, 0, 0)."));
+
 namespace {
 
 class AArch64StackTaggingPreRA : public MachineFunctionPass {
@@ -71,6 +76,7 @@ class AArch64StackTaggingPreRA : public MachineFunctionPass {
   bool mayUseUncheckedLoadStore();
   void uncheckUsesOf(unsigned TaggedReg, int FI);
   void uncheckLoadsAndStores();
+  Optional<int> findFirstSlotCandidate();
 
   bool runOnMachineFunction(MachineFunction &Func) override;
   StringRef getPassName() const override {
@@ -197,6 +203,141 @@ void AArch64StackTaggingPreRA::uncheckLoadsAndStores() {
   }
 }
 
+struct SlotWithTag {
+  int FI;
+  int Tag;
+  SlotWithTag(int FI, int Tag) : FI(FI), Tag(Tag) {}
+  explicit SlotWithTag(const MachineInstr &MI)
+      : FI(MI.getOperand(1).getIndex()), Tag(MI.getOperand(4).getImm()) {}
+  bool operator==(const SlotWithTag &Other) const {
+    return FI == Other.FI && Tag == Other.Tag;
+  }
+};
+
+namespace llvm {
+template <> struct DenseMapInfo<SlotWithTag> {
+  static inline SlotWithTag getEmptyKey() { return {-2, -2}; }
+  static inline SlotWithTag getTombstoneKey() { return {-3, -3}; }
+  static unsigned getHashValue(const SlotWithTag &V) {
+    return hash_combine(DenseMapInfo<int>::getHashValue(V.FI),
+                        DenseMapInfo<int>::getHashValue(V.Tag));
+  }
+  static bool isEqual(const SlotWithTag &A, const SlotWithTag &B) {
+    return A == B;
+  }
+};
+} // namespace llvm
+
+static bool isSlotPreAllocated(MachineFrameInfo *MFI, int FI) {
+  return MFI->getUseLocalStackAllocationBlock() &&
+         MFI->isObjectPreAllocated(FI);
+}
+
+// Pin one of the tagged slots to offset 0 from the tagged base pointer.
+// This would make its address available in a virtual register (IRG's def), as
+// opposed to requiring an ADDG instruction to materialize. This effectively
+// eliminates a vreg (by replacing it with direct uses of IRG, which is usually
+// live almost everywhere anyway), and therefore needs to happen before
+// regalloc.
+Optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() {
+  // Find the best (FI, Tag) pair to pin to offset 0.
+  // Looking at the possible uses of a tagged address, the advantage of pinning
+  // is:
+  // - A COPY to a physical register.
+  //   Does not matter: this would merely trade a MOV instruction for an ADDG.
+  // - ST*G instructions matter, but those mostly appear near the function
+  //   prologue where all the tagged addresses need to be materialized anyway;
+  //   also, counting ST*G uses would overweight large allocas that require
+  //   more than one ST*G instruction.
+  // - Uses in the address operand of a load or store do not require a tagged
+  //   pointer, so they do not benefit either. Such operands have already been
+  //   eliminated (see uncheckLoadsAndStores), so all remaining load/store
+  //   instructions count.
+  // - Any other instruction may benefit from being pinned to offset 0.
+  LLVM_DEBUG(dbgs() << "AArch64StackTaggingPreRA::findFirstSlotCandidate\n");
+  if (!ClFirstSlot)
+    return None;
+
+  DenseMap<SlotWithTag, int> RetagScore;
+  SlotWithTag MaxScoreST{-1, -1};
+  int MaxScore = -1;
+  for (auto *I : ReTags) {
+    SlotWithTag ST{*I};
+    if (isSlotPreAllocated(MFI, ST.FI))
+      continue;
+
+    Register RetagReg = I->getOperand(0).getReg();
+    if (!Register::isVirtualRegister(RetagReg))
+      continue;
+
+    int Score = 0;
+    SmallVector<Register, 8> WorkList;
+    WorkList.push_back(RetagReg);
+
+    while (!WorkList.empty()) {
+      Register UseReg = WorkList.back();
+      WorkList.pop_back();
+      for (auto &UseI : MRI->use_instructions(UseReg)) {
+        unsigned Opcode = UseI.getOpcode();
+        if (Opcode == AArch64::STGOffset || Opcode == AArch64::ST2GOffset ||
+            Opcode == AArch64::STZGOffset || Opcode == AArch64::STZ2GOffset ||
+            Opcode == AArch64::STGPi || Opcode == AArch64::STGloop ||
+            Opcode == AArch64::STZGloop || Opcode == AArch64::STGloop_wback ||
+            Opcode == AArch64::STZGloop_wback)
+          continue;
+        if (UseI.isCopy()) {
+          Register DstReg = UseI.getOperand(0).getReg();
+          if (Register::isVirtualRegister(DstReg))
+            WorkList.push_back(DstReg);
+          continue;
+        }
+        LLVM_DEBUG(dbgs() << "[" << ST.FI << ":" << ST.Tag << "] use of %"
+                          << Register::virtReg2Index(UseReg) << " in " << UseI
+                          << "\n");
+        Score++;
+      }
+    }
+
+    int TotalScore = RetagScore[ST] += Score;
+    if (TotalScore > MaxScore ||
+        (TotalScore == MaxScore && ST.FI > MaxScoreST.FI)) {
+      MaxScore = TotalScore;
+      MaxScoreST = ST;
+    }
+  }
+
+  if (MaxScoreST.FI < 0)
+    return None;
+
+  // If FI's tag is already 0, we are done.
+  if (MaxScoreST.Tag == 0)
+    return MaxScoreST.FI;
+
+  // Otherwise, pick any victim pair (FI, Tag) where Tag == 0 (first found).
+  SlotWithTag SwapST{-1, -1};
+  for (auto *I : ReTags) {
+    SlotWithTag ST{*I};
+    if (ST.Tag == 0) {
+      SwapST = ST;
+      break;
+    }
+  }
+
+  // Swap tags between the victim and the highest scoring pair.
+  // If SwapST is still (-1, -1), that's fine, too - we'll simply assign tag 0
+  // to the highest scoring slot without changing anything else.
+  for (auto *&I : ReTags) {
+    SlotWithTag ST{*I};
+    MachineOperand &TagOp = I->getOperand(4);
+    if (ST == MaxScoreST) {
+      TagOp.setImm(0);
+    } else if (ST == SwapST) {
+      TagOp.setImm(MaxScoreST.Tag);
+    }
+  }
+  return MaxScoreST.FI;
+}
+
 bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
   MRI = &MF->getRegInfo();
@@ -225,11 +366,35 @@ bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
     }
   }
 
+  // Take over from SSP. It does nothing for tagged slots, and should not really
+  // have been enabled in the first place.
+  for (int FI : TaggedSlots)
+    MFI->setObjectSSPLayout(FI, MachineFrameInfo::SSPLK_None);
+
   if (ReTags.empty())
     return false;
 
   if (mayUseUncheckedLoadStore())
     uncheckLoadsAndStores();
 
+  // Find a slot that is used with a zero tag offset, like ADDG #fi, 0. If the
+  // base tagged pointer is set up to point to the address of this slot, the
+  // ADDG instruction can be eliminated.
+  Optional<int> BaseSlot = findFirstSlotCandidate();
+  if (BaseSlot)
+    AFI->setTaggedBasePointerIndex(*BaseSlot);
+
+  for (auto *I : ReTags) {
+    int FI = I->getOperand(1).getIndex();
+    int Tag = I->getOperand(4).getImm();
+    Register Base = I->getOperand(3).getReg();
+    if (Tag == 0 && FI == BaseSlot) {
+      BuildMI(*I->getParent(), I, {}, TII->get(AArch64::COPY),
+              I->getOperand(0).getReg())
+          .addReg(Base);
+      I->eraseFromParent();
+    }
+  }
+
   return true;
 }
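
To make the flow above concrete, a hedged sketch of the outcome, modeled on
the huge_allocas test in this diff (the scratch register and the #3088/#1008
offsets come from that test's frame layout and would differ elsewhere):

  %a   = alloca i8, i64 4096, align 16
  %b   = alloca i8, i64 4096, align 16
  %q   = call i8* @llvm.aarch64.irg.sp(i64 0)
  %a_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
  %b_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %q, i64 0)

  ; In this example %b ends up pinned to the tagged base pointer: its retag
  ; becomes a plain COPY of the IRG result, while %a still needs an ADDG.
  ;   irg  x1, sp              ; x1 is %b_t, no ADDG needed
  ;   add  x8, x1, #3088
  ;   addg x0, x8, #1008, #1   ; %a_t
  ; Had the winning slot carried a non-zero tag offset, the pass would first
  ; swap tag offsets with a slot whose offset is 0, as done in
  ; findFirstSlotCandidate above.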

diff  --git a/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll b/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
index 4af79acf2510..fb3b79815440 100644
--- a/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
+++ b/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
@@ -3,12 +3,32 @@
 define i8* @small_alloca() {
 entry:
 ; CHECK-LABEL: small_alloca:
+; CHECK:      irg  x0, sp{{$}}
+; CHECK:      ret
+  %a = alloca i8, align 16
+  %q = call i8* @llvm.aarch64.irg.sp(i64 0)
+  %q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
+  ret i8* %q1
+}
+
+ at sink = global i8* null, align 8
+
+; Check that IRG is pinned to %b because the store instruction needs
+; the address in a non-fixed physical register and can benefit from it
+; being equal to the base tagged pointer.
+define i8* @small_allocas() {
+entry:
+; CHECK-LABEL: small_allocas:
 ; CHECK:      irg  [[R:x[0-9]+]], sp{{$}}
-; CHECK-NEXT: addg x0, [[R]], #0, #1
+; CHECK:      addg x0, [[R]], #16, #1
+; CHECK:      str  [[R]], {{.*}}sink
 ; CHECK:      ret
   %a = alloca i8, align 16
+  %b = alloca i8, align 16
   %q = call i8* @llvm.aarch64.irg.sp(i64 0)
   %q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
+  %q2 = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %q, i64 2)
+  store i8* %q2, i8** @sink, align 8
   ret i8* %q1
 }
 
@@ -16,16 +36,15 @@ entry:
 define void @huge_allocas() {
 entry:
 ; CHECK-LABEL: huge_allocas:
-; CHECK:      irg  [[R:x[0-9]+]], sp{{$}}
-; CHECK:      add  [[TMP:x[0-9]+]], [[R]], #3088
+; CHECK:      irg  x1, sp{{$}}
+; CHECK:      add  [[TMP:x[0-9]+]], x1, #3088
 ; CHECK:      addg x0, [[TMP]], #1008, #1
-; CHECK:      addg x1, [[R]], #0, #2
 ; CHECK:      bl use2
   %a = alloca i8, i64 4096, align 16
   %b = alloca i8, i64 4096, align 16
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %base, i64 1)
-  %b_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %base, i64 2)
+  %b_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %base, i64 0)
   call void @use2(i8* %a_t, i8* %b_t)
   ret void
 }
@@ -37,8 +56,7 @@ entry:
 ; CHECK-LABEL: realign:
 ; CHECK:      mov  x29, sp
 ; CHECK:      and  sp, x{{[0-9]*}}, #0xffffffffffffffc0
-; CHECK:      irg  [[R:x[0-9]+]], sp{{$}}
-; CHECK:      addg x0, [[R]], #0, #1
+; CHECK:      irg  x0, sp{{$}}
 ; CHECK:      bl use
   %a = alloca i8, i64 4096, align 64
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
@@ -52,10 +70,9 @@ entry:
 define void @dynamic_alloca(i64 %size) {
 entry:
 ; CHECK-LABEL: dynamic_alloca:
-; CHECK:      sub  [[R:x[0-9]+]], x29, #[[OFS:[0-9]+]]
-; CHECK:      irg  [[R]], [[R]]
-; CHECK:      addg x1, [[R]], #0, #1
-; CHECK:      sub  x0, x29, #[[OFS]]
+; CHECK:      sub  x1, x29, #[[OFS:[0-9]+]]
+; CHECK:      irg  x1, x1
+; CHECK-DAG:  sub  x0, x29, #[[OFS]]
 ; CHECK:      bl   use2
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a = alloca i128, i64 %size, align 16
@@ -74,9 +91,9 @@ entryz:
 ; CHECK-LABEL: dynamic_alloca_and_realign:
 ; CHECK:      and  sp, x{{.*}}, #0xffffffffffffffc0
 ; CHECK:      mov  x19, sp
-; CHECK:      irg  [[R:x[0-9]+]], x19
-; CHECK:      addg x1, [[R]], #[[OFS:[0-9]+]], #1
-; CHECK:      add  x0, x19, #[[OFS]]
+; CHECK:      add  x1, x19, #[[OFS:[0-9]+]]
+; CHECK:      irg  x1, x1
+; CHECK-DAG:  add  x0, x19, #[[OFS]]
 ; CHECK:      bl   use2
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a = alloca i128, i64 %size, align 64


        


More information about the llvm-commits mailing list