[llvm] r369297 - MemTag: stack initializer merging.

Evgeniy Stepanov via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 19 13:47:09 PDT 2019


Author: eugenis
Date: Mon Aug 19 13:47:09 2019
New Revision: 369297

URL: http://llvm.org/viewvc/llvm-project?rev=369297&view=rev
Log:
MemTag: stack initializer merging.

Summary:
MTE provides instructions that update memory tags and data at the same
time. This change uses them to generate more compact code for combined
stack variable tagging and initialization.

We collect store and memset instructions that follow an alloca or a
lifetime.start call and replace them with the corresponding MTE
intrinsics. Since the intrinsics operate on 16-byte aligned chunks, the
stored values are combined as necessary.
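
For example (an illustrative sketch; names like %x.tag and %tail are
placeholders - see the ArrayInitConst case in the new test below for the
exact output), a 64-byte alloca whose first 4 bytes are initialized to 42
is now lowered to a single STGP for the first granule plus a zeroing tag
for the remainder, instead of a whole-object settag followed by the
original store:

  call void @llvm.aarch64.stgp(i8* %x.tag, i64 42, i64 0)
  %tail = getelementptr i8, i8* %x.tag, i32 16
  call void @llvm.aarch64.settag.zero(i8* %tail, i64 48)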

Reviewers: pcc, vitalybuka, ostannard

Subscribers: srhines, javed.absar, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D66167

Added:
    llvm/trunk/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll
Modified:
    llvm/trunk/lib/Target/AArch64/AArch64.h
    llvm/trunk/lib/Target/AArch64/AArch64StackTagging.cpp
    llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
    llvm/trunk/test/CodeGen/AArch64/O3-pipeline.ll

Modified: llvm/trunk/lib/Target/AArch64/AArch64.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64.h?rev=369297&r1=369296&r2=369297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64.h (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64.h Mon Aug 19 13:47:09 2019
@@ -56,7 +56,7 @@ InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
                                  AArch64Subtarget &, AArch64RegisterBankInfo &);
 FunctionPass *createAArch64PreLegalizeCombiner();
-FunctionPass *createAArch64StackTaggingPass();
+FunctionPass *createAArch64StackTaggingPass(bool MergeInit);
 
 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);

Modified: llvm/trunk/lib/Target/AArch64/AArch64StackTagging.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64StackTagging.cpp?rev=369297&r1=369296&r2=369297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64StackTagging.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64StackTagging.cpp Mon Aug 19 13:47:09 2019
@@ -55,9 +55,215 @@ using namespace llvm;
 
 #define DEBUG_TYPE "stack-tagging"
 
+static cl::opt<bool> ClMergeInit(
+    "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore,
+    cl::desc("merge stack variable initializers with tagging when possible"));
+
+static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit",
+                                     cl::init(40), cl::Hidden);
+
 static constexpr unsigned kTagGranuleSize = 16;
 
 namespace {
+
+class InitializerBuilder {
+  uint64_t Size;
+  const DataLayout *DL;
+  Value *BasePtr;
+  Function *SetTagFn;
+  Function *SetTagZeroFn;
+  Function *StgpFn;
+
+  // List of initializers sorted by start offset.
+  struct Range {
+    uint64_t Start, End;
+    Instruction *Inst;
+  };
+  SmallVector<Range, 4> Ranges;
+  // 8-byte-aligned offset => 8-byte initializer.
+  // Missing keys are zero-initialized.
+  std::map<uint64_t, Value *> Out;
+
+public:
+  InitializerBuilder(uint64_t Size, const DataLayout *DL, Value *BasePtr,
+                     Function *SetTagFn, Function *SetTagZeroFn,
+                     Function *StgpFn)
+      : Size(Size), DL(DL), BasePtr(BasePtr), SetTagFn(SetTagFn),
+        SetTagZeroFn(SetTagZeroFn), StgpFn(StgpFn) {}
+
+  bool addRange(uint64_t Start, uint64_t End, Instruction *Inst) {
+    auto I = std::lower_bound(
+        Ranges.begin(), Ranges.end(), Start,
+        [](const Range &LHS, uint64_t RHS) { return LHS.End <= RHS; });
+    if (I != Ranges.end() && End > I->Start) {
+      // Overlap - bail.
+      return false;
+    }
+    Ranges.insert(I, {Start, End, Inst});
+    return true;
+  }
+
+  bool addStore(uint64_t Offset, StoreInst *SI, const DataLayout *DL) {
+    int64_t StoreSize = DL->getTypeStoreSize(SI->getOperand(0)->getType());
+    if (!addRange(Offset, Offset + StoreSize, SI))
+      return false;
+    IRBuilder<> IRB(SI);
+    applyStore(IRB, Offset, Offset + StoreSize, SI->getOperand(0));
+    return true;
+  }
+
+  bool addMemSet(uint64_t Offset, MemSetInst *MSI) {
+    uint64_t StoreSize = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+    if (!addRange(Offset, Offset + StoreSize, MSI))
+      return false;
+    IRBuilder<> IRB(MSI);
+    applyMemSet(IRB, Offset, Offset + StoreSize,
+                cast<ConstantInt>(MSI->getValue()));
+    return true;
+  }
+
+  void applyMemSet(IRBuilder<> &IRB, int64_t Start, int64_t End,
+                   ConstantInt *V) {
+    // Out[] does not distinguish between zero and undef, and we already know
+    // that this memset does not overlap with any other initializer. Nothing to
+    // do for memset(0).
+    if (V->isZero())
+      return;
+    for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) {
+      uint64_t Cst = 0x0101010101010101UL;
+      int LowBits = Offset < Start ? (Start - Offset) * 8 : 0;
+      if (LowBits)
+        Cst = (Cst >> LowBits) << LowBits;
+      int HighBits = End - Offset < 8 ? (8 - (End - Offset)) * 8 : 0;
+      if (HighBits)
+        Cst = (Cst << HighBits) >> HighBits;
+      ConstantInt *C =
+          ConstantInt::get(IRB.getInt64Ty(), Cst * V->getZExtValue());
+
+      Value *&CurrentV = Out[Offset];
+      if (!CurrentV) {
+        CurrentV = C;
+      } else {
+        CurrentV = IRB.CreateOr(CurrentV, C);
+      }
+    }
+  }
+
+  // Take a 64-bit slice of the value starting at the given offset (in bytes).
+  // Offset can be negative. Pad with zeroes on both sides when necessary.
+  Value *sliceValue(IRBuilder<> &IRB, Value *V, int64_t Offset) {
+    if (Offset > 0) {
+      V = IRB.CreateLShr(V, Offset * 8);
+      V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty());
+    } else if (Offset < 0) {
+      V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty());
+      V = IRB.CreateShl(V, -Offset * 8);
+    } else {
+      V = IRB.CreateZExtOrTrunc(V, IRB.getInt64Ty());
+    }
+    return V;
+  }
+
+  void applyStore(IRBuilder<> &IRB, int64_t Start, int64_t End,
+                  Value *StoredValue) {
+    StoredValue = flatten(IRB, StoredValue);
+    for (int64_t Offset = Start - Start % 8; Offset < End; Offset += 8) {
+      Value *V = sliceValue(IRB, StoredValue, Offset - Start);
+      Value *&CurrentV = Out[Offset];
+      if (!CurrentV) {
+        CurrentV = V;
+      } else {
+        CurrentV = IRB.CreateOr(CurrentV, V);
+      }
+    }
+  }
+
+  void generate(IRBuilder<> &IRB) {
+    LLVM_DEBUG(dbgs() << "Combined initializer\n");
+    // No initializers => the entire allocation is undef.
+    if (Ranges.empty()) {
+      emitUndef(IRB, 0, Size);
+      return;
+    }
+
+    // Walk the 8-byte initializer list 16 bytes at a time.
+    // If either of the two 8-byte halves is non-zero and non-undef, emit STGP.
+    // Otherwise, emit zeroes up to the next available item.
+    uint64_t LastOffset = 0;
+    for (uint64_t Offset = 0; Offset < Size; Offset += 16) {
+      auto I1 = Out.find(Offset);
+      auto I2 = Out.find(Offset + 8);
+      if (I1 == Out.end() && I2 == Out.end())
+        continue;
+
+      if (Offset > LastOffset)
+        emitZeroes(IRB, LastOffset, Offset - LastOffset);
+
+      Value *Store1 = I1 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty())
+                                      : I1->second;
+      Value *Store2 = I2 == Out.end() ? Constant::getNullValue(IRB.getInt64Ty())
+                                      : I2->second;
+      emitPair(IRB, Offset, Store1, Store2);
+      LastOffset = Offset + 16;
+    }
+
+    // memset(0) does not update Out[], therefore the tail can be either undef
+    // or zero.
+    if (LastOffset < Size)
+      emitZeroes(IRB, LastOffset, Size - LastOffset);
+
+    for (const auto &R : Ranges) {
+      R.Inst->eraseFromParent();
+    }
+  }
+
+  void emitZeroes(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) {
+    LLVM_DEBUG(dbgs() << "  [" << Offset << ", " << Offset + Size
+                      << ") zero\n");
+    Value *Ptr = BasePtr;
+    if (Offset)
+      Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+    IRB.CreateCall(SetTagZeroFn,
+                   {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
+  }
+
+  void emitUndef(IRBuilder<> &IRB, uint64_t Offset, uint64_t Size) {
+    LLVM_DEBUG(dbgs() << "  [" << Offset << ", " << Offset + Size
+                      << ") undef\n");
+    Value *Ptr = BasePtr;
+    if (Offset)
+      Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+    IRB.CreateCall(SetTagFn, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
+  }
+
+  void emitPair(IRBuilder<> &IRB, uint64_t Offset, Value *A, Value *B) {
+    LLVM_DEBUG(dbgs() << "  [" << Offset << ", " << Offset + 16 << "):\n");
+    LLVM_DEBUG(dbgs() << "    " << *A << "\n    " << *B << "\n");
+    Value *Ptr = BasePtr;
+    if (Offset)
+      Ptr = IRB.CreateConstGEP1_32(Ptr, Offset);
+    IRB.CreateCall(StgpFn, {Ptr, A, B});
+  }
+
+  Value *flatten(IRBuilder<> &IRB, Value *V) {
+    if (V->getType()->isIntegerTy())
+      return V;
+    // vector of pointers -> vector of ints
+    if (VectorType *VecTy = dyn_cast<VectorType>(V->getType())) {
+      LLVMContext &Ctx = IRB.getContext();
+      Type *EltTy = VecTy->getElementType();
+      if (EltTy->isPointerTy()) {
+        uint32_t EltSize = DL->getTypeSizeInBits(EltTy);
+        Type *NewTy = VectorType::get(IntegerType::get(Ctx, EltSize),
+                                      VecTy->getNumElements());
+        V = IRB.CreatePointerCast(V, NewTy);
+      }
+    }
+    return IRB.CreateBitOrPointerCast(
+        V, IRB.getIntNTy(DL->getTypeStoreSize(V->getType()) * 8));
+  }
+};
+
 class AArch64StackTagging : public FunctionPass {
   struct AllocaInfo {
     AllocaInst *AI;
@@ -67,10 +273,15 @@ class AArch64StackTagging : public Funct
     int Tag; // -1 for non-tagged allocations
   };
 
+  bool MergeInit;
+
 public:
   static char ID; // Pass ID, replacement for typeid
 
-  AArch64StackTagging() : FunctionPass(ID) {
+  AArch64StackTagging(bool MergeInit = true)
+      : FunctionPass(ID),
+        MergeInit(ClMergeInit.getNumOccurrences() > 0 ? ClMergeInit
+                                                      : MergeInit) {
     initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry());
   }
 
@@ -81,6 +292,9 @@ public:
                  uint64_t Size);
   void untagAlloca(AllocaInst *AI, Instruction *InsertBefore, uint64_t Size);
 
+  Instruction *collectInitializers(Instruction *StartInst, Value *StartPtr,
+                                   uint64_t Size, InitializerBuilder &IB);
+
   Instruction *
   insertBaseTaggedPointer(const MapVector<AllocaInst *, AllocaInfo> &Allocas,
                           const DominatorTree *DT);
@@ -92,9 +306,12 @@ private:
   Function *F;
   Function *SetTagFunc;
   const DataLayout *DL;
+  AAResults *AA;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    if (MergeInit)
+      AU.addRequired<AAResultsWrapperPass>();
   }
 };
 
@@ -107,8 +324,68 @@ INITIALIZE_PASS_BEGIN(AArch64StackTaggin
 INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging",
                     false, false)
 
-FunctionPass *llvm::createAArch64StackTaggingPass() {
-  return new AArch64StackTagging();
+FunctionPass *llvm::createAArch64StackTaggingPass(bool MergeInit) {
+  return new AArch64StackTagging(MergeInit);
+}
+
+Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst,
+                                                      Value *StartPtr,
+                                                      uint64_t Size,
+                                                      InitializerBuilder &IB) {
+  MemoryLocation AllocaLoc{StartPtr, Size};
+  Instruction *LastInst = StartInst;
+  BasicBlock::iterator BI(StartInst);
+
+  unsigned Count = 0;
+  for (; Count < ClScanLimit && !BI->isTerminator(); ++BI) {
+    if (!isa<DbgInfoIntrinsic>(*BI))
+      ++Count;
+
+    if (isNoModRef(AA->getModRefInfo(&*BI, AllocaLoc)))
+      continue;
+
+    if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
+      // If the instruction is readnone, ignore it, otherwise bail out.  We
+      // don't even allow readonly here because we don't want something like:
+      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+      if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+        break;
+      continue;
+    }
+
+    if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+      if (!NextStore->isSimple())
+        break;
+
+      // Check to see if this store is to a constant offset from the start ptr.
+      int64_t Offset;
+      if (!isPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset,
+                           *DL))
+        break;
+
+      if (!IB.addStore(Offset, NextStore, DL))
+        break;
+      LastInst = NextStore;
+    } else {
+      MemSetInst *MSI = cast<MemSetInst>(BI);
+
+      if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
+        break;
+
+      if (!isa<ConstantInt>(MSI->getValue()))
+        break;
+
+      // Check to see if this store is to a constant offset from the start ptr.
+      int64_t Offset;
+      if (!isPointerOffset(StartPtr, MSI->getDest(), Offset, *DL))
+        break;
+
+      if (!IB.addMemSet(Offset, MSI))
+        break;
+      LastInst = MSI;
+    }
+  }
+  return LastInst;
 }
 
 bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) {
@@ -127,8 +404,23 @@ bool AArch64StackTagging::isInterestingA
 
 void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore,
                                     Value *Ptr, uint64_t Size) {
+  auto SetTagZeroFunc =
+      Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag_zero);
+  auto StgpFunc =
+      Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_stgp);
+
+  InitializerBuilder IB(Size, DL, Ptr, SetTagFunc, SetTagZeroFunc, StgpFunc);
+  bool LittleEndian =
+      Triple(AI->getModule()->getTargetTriple()).isLittleEndian();
+  // Current implementation of initializer merging assumes little endianness.
+  if (MergeInit && !F->hasOptNone() && LittleEndian) {
+    LLVM_DEBUG(dbgs() << "collecting initializers for " << *AI
+                      << ", size = " << Size << "\n");
+    InsertBefore = collectInitializers(InsertBefore, Ptr, Size, IB);
+  }
+
   IRBuilder<> IRB(InsertBefore);
-  IRB.CreateCall(SetTagFunc, {Ptr, ConstantInt::get(IRB.getInt64Ty(), Size)});
+  IB.generate(IRB);
 }
 
 void AArch64StackTagging::untagAlloca(AllocaInst *AI, Instruction *InsertBefore,
@@ -205,6 +497,8 @@ bool AArch64StackTagging::runOnFunction(
 
   F = &Fn;
   DL = &Fn.getParent()->getDataLayout();
+  if (MergeInit)
+    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
   MapVector<AllocaInst *, AllocaInfo> Allocas; // need stable iteration order
   SmallVector<Instruction *, 8> RetVec;

Modified: llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp?rev=369297&r1=369296&r2=369297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp Mon Aug 19 13:47:09 2019
@@ -448,7 +448,8 @@ void AArch64PassConfig::addIRPasses() {
     addPass(createLICMPass());
   }
 
-  addPass(createAArch64StackTaggingPass());
+  addPass(createAArch64StackTaggingPass(/* MergeInit = */ TM->getOptLevel() !=
+                                        CodeGenOpt::None));
 }
 
 // Pass Pipeline Configuration

Modified: llvm/trunk/test/CodeGen/AArch64/O3-pipeline.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/O3-pipeline.ll?rev=369297&r1=369296&r2=369297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/O3-pipeline.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/O3-pipeline.ll Mon Aug 19 13:47:09 2019
@@ -55,6 +55,8 @@
 ; CHECK-NEXT:       Interleaved Load Combine Pass
 ; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Interleaved Access Pass
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       AArch64 Stack Tagging
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       CodeGen Prepare

Added: llvm/trunk/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll?rev=369297&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll Mon Aug 19 13:47:09 2019
@@ -0,0 +1,308 @@
+; RUN: opt < %s -stack-tagging -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android"
+
+declare void @use(i8*)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)
+
+define void @OneVarNoInit() sanitize_memtag {
+entry:
+  %x = alloca i32, align 4
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  call void @use(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @OneVarNoInit(
+; CHECK-DAG:  [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
+; CHECK-DAG:  [[TX:%.*]] = call { i32, [12 x i8] }* @llvm.aarch64.tagp.{{.*}}({ i32, [12 x i8] }* [[X]], {{.*}}, i64 0)
+; CHECK-DAG:  [[TX32:%.*]] = bitcast { i32, [12 x i8] }* [[TX]] to i32*
+; CHECK-DAG:  [[TX8:%.*]] = bitcast i32* [[TX32]] to i8*
+; CHECK-DAG:  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TX8]])
+; CHECK-DAG:  call void @llvm.aarch64.settag(i8* [[TX8]], i64 16)
+; CHECK-DAG:  call void @use(i8* nonnull [[TX8]])
+; CHECK-DAG:  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TX8]])
+
+define void @OneVarInitConst() sanitize_memtag {
+entry:
+  %x = alloca i32, align 4
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  store i32 42, i32* %x, align 4
+  call void @use(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @OneVarInitConst(
+; CHECK:  [[TX:%.*]] = call { i32, [12 x i8] }* @llvm.aarch64.tagp
+; CHECK:  [[TX32:%.*]] = bitcast { i32, [12 x i8] }* [[TX]] to i32*
+; CHECK:  [[TX8:%.*]] = bitcast i32* [[TX32]] to i8*
+; CHECK-NOT: aarch64.settag
+; CHECK:  call void @llvm.aarch64.stgp(i8* [[TX8]], i64 42, i64 0)
+; Untagging before lifetime.end:
+; CHECK:  call void @llvm.aarch64.settag(
+; CHECK-NOT: aarch64.settag
+; CHECK:  ret void
+
+define void @ArrayInitConst() sanitize_memtag {
+entry:
+  %x = alloca i32, i32 16, align 4
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 64, i8* nonnull %0)
+  store i32 42, i32* %x, align 4
+  call void @use(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @ArrayInitConst(
+; CHECK:  [[TX:%.*]] = call i32* @llvm.aarch64.tagp.
+; CHECK:  [[TX8:%.*]] = bitcast i32* [[TX]] to i8*
+; CHECK:  call void @llvm.aarch64.stgp(i8* [[TX8]], i64 42, i64 0)
+; CHECK:  [[TX8_16:%.*]] = getelementptr i8, i8* [[TX8]], i32 16
+; CHECK:  call void @llvm.aarch64.settag.zero(i8* [[TX8_16]], i64 48)
+; CHECK:  ret void
+
+define void @ArrayInitConst2() sanitize_memtag {
+entry:
+  %x = alloca i32, i32 16, align 4
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 64, i8* nonnull %0)
+  store i32 42, i32* %x, align 4
+  %1 = getelementptr i32, i32* %x, i32 1
+  store i32 43, i32* %1, align 4
+  %2 = getelementptr i32, i32* %x, i32 2
+  %3 = bitcast i32* %2 to i64*
+  store i64 -1, i64* %3, align 4
+  call void @use(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @ArrayInitConst2(
+; CHECK:  [[TX:%.*]] = call i32* @llvm.aarch64.tagp.
+; CHECK:  [[TX8:%.*]] = bitcast i32* [[TX]] to i8*
+; CHECK:  call void @llvm.aarch64.stgp(i8* [[TX8]], i64 184683593770, i64 -1)
+; CHECK:  [[TX8_16:%.*]] = getelementptr i8, i8* [[TX8]], i32 16
+; CHECK:  call void @llvm.aarch64.settag.zero(i8* [[TX8_16]], i64 48)
+; CHECK:  ret void
+
+define void @ArrayInitConstSplit() sanitize_memtag {
+entry:
+  %x = alloca i32, i32 16, align 4
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 64, i8* nonnull %0)
+  %1 = getelementptr i32, i32* %x, i32 1
+  %2 = bitcast i32* %1 to i64*
+  store i64 -1, i64* %2, align 4
+  call void @use(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @ArrayInitConstSplit(
+; CHECK:  [[TX:%.*]] = call i32* @llvm.aarch64.tagp.
+; CHECK:  [[TX8:%.*]] = bitcast i32* [[TX]] to i8*
+; CHECK:  call void @llvm.aarch64.stgp(i8* [[TX8]], i64 -4294967296, i64 4294967295)
+; CHECK:  ret void
+
+define void @ArrayInitConstWithHoles() sanitize_memtag {
+entry:
+  %x = alloca i32, i32 32, align 4
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 128, i8* nonnull %0)
+  %1 = getelementptr i32, i32* %x, i32 5
+  store i32 42, i32* %1, align 4
+  %2 = getelementptr i32, i32* %x, i32 14
+  store i32 43, i32* %2, align 4
+  call void @use(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 128, i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @ArrayInitConstWithHoles(
+; CHECK:  [[TX:%.*]] = call i32* @llvm.aarch64.tagp.
+; CHECK:  [[TX8:%.*]] = bitcast i32* [[TX]] to i8*
+; CHECK:  call void @llvm.aarch64.settag.zero(i8* [[TX8]], i64 16)
+; CHECK:  [[TX8_16:%.*]] = getelementptr i8, i8* %0, i32 16
+; CHECK:  call void @llvm.aarch64.stgp(i8* [[TX8_16]], i64 180388626432, i64 0)
+; CHECK:  [[TX8_32:%.*]] = getelementptr i8, i8* %0, i32 32
+; CHECK:  call void @llvm.aarch64.settag.zero(i8* [[TX8_32]], i64 16)
+; CHECK:  [[TX8_48:%.*]] = getelementptr i8, i8* %0, i32 48
+; CHECK:  call void @llvm.aarch64.stgp(i8* [[TX8_48]], i64 0, i64 43)
+; CHECK:  [[TX8_64:%.*]] = getelementptr i8, i8* %0, i32 64
+; CHECK:  call void @llvm.aarch64.settag.zero(i8* [[TX8_64]], i64 64)
+; CHECK:  ret void
+
+define void @InitNonConst(i32 %v) sanitize_memtag {
+entry:
+  %x = alloca i32, align 4
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  store i32 %v, i32* %x, align 4
+  call void @use(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @InitNonConst(
+; CHECK:  [[TX:%.*]] = call { i32, [12 x i8] }* @llvm.aarch64.tagp
+; CHECK:  [[TX32:%.*]] = bitcast { i32, [12 x i8] }* [[TX]] to i32*
+; CHECK:  [[TX8:%.*]] = bitcast i32* [[TX32]] to i8*
+; CHECK:  [[V:%.*]] = zext i32 %v to i64
+; CHECK:  call void @llvm.aarch64.stgp(i8* [[TX8]], i64 [[V]], i64 0)
+; CHECK:  ret void
+
+define void @InitNonConst2(i32 %v, i32 %w) sanitize_memtag {
+entry:
+  %x = alloca i32, i32 4, align 4
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0)
+  store i32 %v, i32* %x, align 4
+  %1 = getelementptr i32, i32* %x, i32 1
+  store i32 %w, i32* %1, align 4
+  call void @use(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @InitNonConst2(
+; CHECK:  [[TX:%.*]] = call i32* @llvm.aarch64.tagp
+; CHECK:  [[TX8:%.*]] = bitcast i32* [[TX]] to i8*
+; CHECK:  [[V:%.*]] = zext i32 %v to i64
+; CHECK:  [[W:%.*]] = zext i32 %w to i64
+; CHECK:  [[WS:%.*]] = shl i64 [[W]], 32
+; CHECK:  [[VW:%.*]] = or i64 [[V]], [[WS]]
+; CHECK:  call void @llvm.aarch64.stgp(i8* [[TX8]], i64 [[VW]], i64 0)
+; CHECK:  ret void
+
+define void @InitVector() sanitize_memtag {
+entry:
+  %x = alloca i32, i32 4, align 4
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0)
+  %1 = bitcast i32* %x to <2 x i32>*
+  store <2 x i32> <i32 1, i32 2>, <2 x i32>* %1, align 4
+  call void @use(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @InitVector(
+; CHECK:  [[TX:%.*]] = call i32* @llvm.aarch64.tagp
+; CHECK:  [[TX8:%.*]] = bitcast i32* [[TX]] to i8*
+; CHECK:  call void @llvm.aarch64.stgp(i8* [[TX8]], i64 bitcast (<2 x i32> <i32 1, i32 2> to i64), i64 0)
+; CHECK:  ret void
+
+define void @InitVectorPtr(i32* %p) sanitize_memtag {
+entry:
+  %s = alloca <4 x i32*>, align 8
+  %v0 = insertelement <4 x i32*> undef, i32* %p, i32 0
+  %v1 = shufflevector <4 x i32*> %v0, <4 x i32*> undef, <4 x i32> zeroinitializer
+  store <4 x i32*> %v1, <4 x i32*>* %s
+  %0 = bitcast <4 x i32*>* %s to i8*
+  call void @use(i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @InitVectorPtr(
+; CHECK:  call <4 x i32*>* @llvm.aarch64.tagp
+; CHECK:  [[V1:%.*]] = shufflevector
+; CHECK:  [[V2:%.*]] = ptrtoint <4 x i32*> [[V1]] to <4 x i64>
+; CHECK:  [[V3:%.*]] = bitcast <4 x i64> [[V2]] to i256
+; CHECK:  [[A1:%.*]] = trunc i256 [[V3]] to i64
+; CHECK:  [[A2_:%.*]] = lshr i256 [[V3]], 64
+; CHECK:  [[A2:%.*]] = trunc i256 [[A2_]] to i64
+; CHECK:  [[A3_:%.*]] = lshr i256 [[V3]], 128
+; CHECK:  [[A3:%.*]] = trunc i256 [[A3_]] to i64
+; CHECK:  [[A4_:%.*]] = lshr i256 [[V3]], 192
+; CHECK:  [[A4:%.*]] = trunc i256 [[A4_]] to i64
+; CHECK:  call void @llvm.aarch64.stgp({{.*}}, i64 [[A1]], i64 [[A2]])
+; CHECK:  call void @llvm.aarch64.stgp({{.*}}, i64 [[A3]], i64 [[A4]])
+; CHECK:  ret void
+
+define void @InitVectorSplit() sanitize_memtag {
+entry:
+  %x = alloca i32, i32 4, align 4
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0)
+  %1 = getelementptr i32, i32* %x, i32 1
+  %2 = bitcast i32* %1 to <2 x i32>*
+  store <2 x i32> <i32 1, i32 2>, <2 x i32>* %2, align 4
+  call void @use(i8* nonnull %0)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @InitVectorSplit(
+; CHECK:  [[TX:%.*]] = call i32* @llvm.aarch64.tagp
+; CHECK:  [[TX8:%.*]] = bitcast i32* [[TX]] to i8*
+; CHECK:  call void @llvm.aarch64.stgp(i8* [[TX8]], i64 shl (i64 bitcast (<2 x i32> <i32 1, i32 2> to i64), i64 32), i64 lshr (i64 bitcast (<2 x i32> <i32 1, i32 2> to i64), i64 32))
+; CHECK:  ret void
+
+define void @MemSetZero() sanitize_memtag {
+entry:
+  %x = alloca i32, i32 8, align 16
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.memset.p0i8.i64(i8* nonnull align 16 %0, i8 0, i64 32, i1 false)
+  call void @use(i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @MemSetZero(
+; CHECK:  [[TX:%.*]] = call i32* @llvm.aarch64.tagp
+; CHECK:  [[TX8:%.*]] = bitcast i32* [[TX]] to i8*
+; CHECK:  call void @llvm.aarch64.settag.zero(i8* [[TX8]], i64 32)
+; CHECK:  ret void
+
+
+define void @MemSetNonZero() sanitize_memtag {
+entry:
+  %x = alloca i32, i32 8, align 16
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.memset.p0i8.i64(i8* nonnull align 16 %0, i8 42, i64 32, i1 false)
+  call void @use(i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @MemSetNonZero(
+; CHECK:  call void @llvm.aarch64.stgp(i8* {{.*}}, i64 3038287259199220266, i64 3038287259199220266)
+; CHECK:  call void @llvm.aarch64.stgp(i8* {{.*}}, i64 3038287259199220266, i64 3038287259199220266)
+; CHECK:  ret void
+
+
+define void @MemSetNonZero2() sanitize_memtag {
+entry:
+  %x = alloca [32 x i8], align 16
+  %0 = getelementptr inbounds [32 x i8], [32 x i8]* %x, i64 0, i64 2
+  call void @llvm.memset.p0i8.i64(i8* nonnull %0, i8 42, i64 28, i1 false)
+  call void @use(i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @MemSetNonZero2(
+; CHECK:  call void @llvm.aarch64.stgp(i8* {{.*}}, i64 3038287259199209472, i64 3038287259199220266)
+; CHECK:  call void @llvm.aarch64.stgp(i8* {{.*}}, i64 3038287259199220266, i64 46360584399402)
+; CHECK:  ret void
+
+define void @MemSetNonZero3() sanitize_memtag {
+entry:
+  %x = alloca [32 x i8], align 16
+  %0 = getelementptr inbounds [32 x i8], [32 x i8]* %x, i64 0, i64 2
+  call void @llvm.memset.p0i8.i64(i8* nonnull %0, i8 42, i64 4, i1 false)
+  %1 = getelementptr inbounds [32 x i8], [32 x i8]* %x, i64 0, i64 24
+  call void @llvm.memset.p0i8.i64(i8* nonnull %1, i8 42, i64 8, i1 false)
+  call void @use(i8* nonnull %0)
+  ret void
+}
+
+; CHECK-LABEL: define void @MemSetNonZero3(
+; CHECK:  call void @llvm.aarch64.stgp(i8* {{.*}}, i64 46360584388608, i64 0)
+; CHECK:  call void @llvm.aarch64.stgp(i8* {{.*}}, i64 0, i64 3038287259199220266)
+; CHECK:  ret void




More information about the llvm-commits mailing list