[Mlir-commits] [mlir] 4f04db4 - AllocaInst should store Align instead of MaybeAlign.

Eli Friedman llvmlistbot at llvm.org
Sat May 16 14:53:35 PDT 2020


Author: Eli Friedman
Date: 2020-05-16T14:53:16-07:00
New Revision: 4f04db4b5439f390c48408f9b94875810e88ffc6

URL: https://github.com/llvm/llvm-project/commit/4f04db4b5439f390c48408f9b94875810e88ffc6
DIFF: https://github.com/llvm/llvm-project/commit/4f04db4b5439f390c48408f9b94875810e88ffc6.diff

LOG: AllocaInst should store Align instead of MaybeAlign.

Along the lines of D77454 and D79968.  Unlike loads and stores (which
default to the ABI type alignment), the default alignment here is
getPrefTypeAlign, matching the existing handling in various places,
including SelectionDAG and InstCombine.

Differential Revision: https://reviews.llvm.org/D80044
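
As an illustration (not part of this commit), here is a minimal sketch of the
new behavior. The helper name demoAllocaAlign and the surrounding setup are
assumptions; the AllocaInst constructor, getAlign(), and setAlignment(Align)
signatures are the ones introduced by this patch.

#include <cassert>
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Assumes a well-formed function F with a non-empty entry block.
static void demoAllocaAlign(Function &F) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  Instruction *InsertPt = &*F.getEntryBlock().getFirstInsertionPt();
  Type *I32 = Type::getInt32Ty(F.getContext());

  // No explicit alignment given: the alloca now picks up the preferred type
  // alignment from the DataLayout instead of carrying an unset MaybeAlign.
  AllocaInst *AI = new AllocaInst(I32, DL.getAllocaAddrSpace(),
                                  /*ArraySize=*/nullptr, "tmp", InsertPt);
  assert(AI->getAlign() == DL.getPrefTypeAlign(I32));

  // getAlign() returns a plain Align and setAlignment() takes one; the
  // legacy getAlignment() accessor can no longer return 0.
  AI->setAlignment(Align(16));
  assert(AI->getAlignment() == 16);
}

Callers that previously passed an empty MaybeAlign to mean "unspecified" now
have to supply a concrete Align, typically DL.getPrefTypeAlign(Ty) or the
parameter alignment, as the SROA, ArgumentPromotion, and NVPTXLowerArgs
changes below do.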

Added: 
    

Modified: 
    llvm/include/llvm/IR/Instructions.h
    llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
    llvm/lib/AsmParser/LLParser.cpp
    llvm/lib/Bitcode/Reader/BitcodeReader.cpp
    llvm/lib/CodeGen/CodeGenPrepare.cpp
    llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
    llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/IR/Core.cpp
    llvm/lib/IR/Instructions.cpp
    llvm/lib/Target/AArch64/AArch64StackTagging.cpp
    llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
    llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
    llvm/lib/Transforms/Coroutines/CoroElide.cpp
    llvm/lib/Transforms/Coroutines/CoroFrame.cpp
    llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
    llvm/lib/Transforms/IPO/AttributorAttributes.cpp
    llvm/lib/Transforms/IPO/Inliner.cpp
    llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
    llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
    llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
    llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
    llvm/lib/Transforms/Scalar/GVNHoist.cpp
    llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
    llvm/lib/Transforms/Scalar/SROA.cpp
    llvm/lib/Transforms/Utils/Local.cpp
    llvm/test/Assembler/alloca-addrspace-elems.ll
    llvm/test/Assembler/alloca-addrspace0.ll
    llvm/test/Assembler/block-labels.ll
    llvm/test/Assembler/datalayout-alloca-addrspace-mismatch-0.ll
    llvm/test/Assembler/datalayout-alloca-addrspace.ll
    llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll
    llvm/test/CodeGen/AMDGPU/alloca.ll
    llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
    llvm/test/Transforms/ArgumentPromotion/attrs.ll
    llvm/test/Transforms/ArgumentPromotion/byval-2.ll
    llvm/test/Transforms/ArgumentPromotion/byval.ll
    llvm/test/Transforms/ArgumentPromotion/dbg.ll
    llvm/test/Transforms/ArgumentPromotion/tail.ll
    llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
    llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll
    llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll
    llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll
    llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll
    llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll
    llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
    llvm/test/Transforms/Attributor/value-simplify.ll
    llvm/test/Transforms/NewGVN/pr33367.ll
    llvm/test/Transforms/SROA/alignment.ll
    llvm/test/Transforms/SROA/pointer-offset-size.ll
    mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
    polly/lib/CodeGen/BlockGenerators.cpp
    polly/lib/CodeGen/IslNodeBuilder.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 090674b52353..522441be4626 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -66,21 +66,19 @@ class AllocaInst : public UnaryInstruction {
   AllocaInst *cloneImpl() const;
 
 public:
-  explicit AllocaInst(Type *Ty, unsigned AddrSpace,
-                      Value *ArraySize = nullptr,
-                      const Twine &Name = "",
-                      Instruction *InsertBefore = nullptr);
+  explicit AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
+                      const Twine &Name, Instruction *InsertBefore);
   AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
              const Twine &Name, BasicBlock *InsertAtEnd);
 
-  AllocaInst(Type *Ty, unsigned AddrSpace,
-             const Twine &Name, Instruction *InsertBefore = nullptr);
+  AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name,
+             Instruction *InsertBefore);
   AllocaInst(Type *Ty, unsigned AddrSpace,
              const Twine &Name, BasicBlock *InsertAtEnd);
 
-  AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, MaybeAlign Align,
+  AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, Align Align,
              const Twine &Name = "", Instruction *InsertBefore = nullptr);
-  AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, MaybeAlign Align,
+  AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, Align Align,
              const Twine &Name, BasicBlock *InsertAtEnd);
 
   /// Return true if there is an allocation size parameter to the allocation
@@ -109,16 +107,12 @@ class AllocaInst : public UnaryInstruction {
 
   /// Return the alignment of the memory that is being allocated by the
   /// instruction.
-  MaybeAlign getAlign() const {
-    return decodeMaybeAlign(getSubclassDataFromInstruction() & 31);
+  Align getAlign() const {
+    return *decodeMaybeAlign(getSubclassDataFromInstruction() & 31);
   }
   // FIXME: Remove this one transition to Align is over.
-  unsigned getAlignment() const {
-    if (const auto MA = getAlign())
-      return MA->value();
-    return 0;
-  }
-  void setAlignment(MaybeAlign Align);
+  unsigned getAlignment() const { return getAlign().value(); }
+  void setAlignment(Align Align);
 
   /// Return true if this alloca is in the entry block of the function and is a
   /// constant size. If so, the code generator will fold it into the

diff  --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index 41180c5c678d..c1910358fc80 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -61,7 +61,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   bool processMemCpy(MemCpyInst *M);
   bool processMemMove(MemMoveInst *M);
   bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
-                            uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
+                            uint64_t cpyLen, Align cpyAlign, CallInst *C);
   bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
   bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
   bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);

diff  --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index c0db5884bc37..9e8fe96ac3a8 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -7007,7 +7007,12 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
   if (Size && !Size->getType()->isIntegerTy())
     return Error(SizeLoc, "element count must have integer type");
 
-  AllocaInst *AI = new AllocaInst(Ty, AddrSpace, Size, Alignment);
+  SmallPtrSet<Type *, 4> Visited;
+  if (!Alignment && !Ty->isSized(&Visited))
+    return Error(TyLoc, "Cannot allocate unsized type");
+  if (!Alignment)
+    Alignment = M->getDataLayout().getPrefTypeAlign(Ty);
+  AllocaInst *AI = new AllocaInst(Ty, AddrSpace, Size, *Alignment);
   AI->setUsedWithInAlloca(IsInAlloca);
   AI->setSwiftError(IsSwiftError);
   Inst = AI;

diff  --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 7b62bab3041d..e95d7b1ee8a9 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -4826,7 +4826,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       const DataLayout &DL = TheModule->getDataLayout();
       unsigned AS = DL.getAllocaAddrSpace();
 
-      AllocaInst *AI = new AllocaInst(Ty, AS, Size, Align);
+      SmallPtrSet<Type *, 4> Visited;
+      if (!Align && !Ty->isSized(&Visited))
+        return error("alloca of unsized type");
+      if (!Align)
+        Align = DL.getPrefTypeAlign(Ty);
+
+      AllocaInst *AI = new AllocaInst(Ty, AS, Size, *Align);
       AI->setUsedWithInAlloca(InAlloca);
       AI->setSwiftError(SwiftError);
       I = AI;

diff  --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 1c547ab7a1b2..59cbaf486a69 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1959,7 +1959,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
       AllocaInst *AI;
       if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
           DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
-        AI->setAlignment(MaybeAlign(PrefAlign));
+        AI->setAlignment(Align(PrefAlign));
       // Global variables can only be aligned if they are defined in this
       // object (i.e. they are uniquely initialized in this object), and
       // over-aligning global variables that have an explicit section is

diff  --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index aa71ceb795c7..9edc83e9c6c9 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1875,7 +1875,7 @@ bool IRTranslator::translateAlloca(const User &U,
       MIRBuilder.buildConstant(IntPtrTy, ~(uint64_t)(StackAlign.value() - 1));
   auto AlignedAlloc = MIRBuilder.buildAnd(IntPtrTy, AllocAdd, AlignCst);
 
-  Align Alignment = max(AI.getAlign(), DL->getPrefTypeAlign(Ty));
+  Align Alignment = std::max(AI.getAlign(), DL->getPrefTypeAlign(Ty));
   if (Alignment <= StackAlign)
     Alignment = Align(1);
   MIRBuilder.buildDynStackAlloc(getOrCreateVReg(AI), AlignedAlloc, Alignment);

diff  --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index c4821254f897..4f46f1990531 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -139,7 +139,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
         // or the preferred alignment of the type if none is specified.
         //
         // (Unspecified alignment on allocas will be going away soon.)
-        Align SpecifiedAlign = AI->getAlign() ? *AI->getAlign() : TyPrefAlign;
+        Align SpecifiedAlign = AI->getAlign();
 
         // If the preferred alignment of the type is higher than the specified
         // alignment of the alloca, promote the alignment, as long as it doesn't

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 00f5a1eed8e0..bac0f09cbed8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3890,7 +3890,7 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   auto &DL = DAG.getDataLayout();
   uint64_t TySize = DL.getTypeAllocSize(Ty);
-  MaybeAlign Alignment = max(DL.getPrefTypeAlign(Ty), I.getAlign());
+  MaybeAlign Alignment = std::max(DL.getPrefTypeAlign(Ty), I.getAlign());
 
   SDValue AllocSize = getValue(I.getArraySize());
 
@@ -9507,8 +9507,7 @@ static void tryToElideArgumentCopy(
                   "object size\n");
     return;
   }
-  Align RequiredAlignment = AI->getAlign().getValueOr(
-      FuncInfo.MF->getDataLayout().getABITypeAlign(AI->getAllocatedType()));
+  Align RequiredAlignment = AI->getAlign();
   if (MFI.getObjectAlign(FixedIndex) < RequiredAlignment) {
     LLVM_DEBUG(dbgs() << "  argument copy elision failed: alignment of alloca "
                          "greater than stack argument alignment ("

diff  --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 35e4717700c2..dbaf62251ce3 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -2021,7 +2021,7 @@ void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) {
   if (GlobalObject *GV = dyn_cast<GlobalObject>(P))
     GV->setAlignment(MaybeAlign(Bytes));
   else if (AllocaInst *AI = dyn_cast<AllocaInst>(P))
-    AI->setAlignment(MaybeAlign(Bytes));
+    AI->setAlignment(Align(Bytes));
   else if (LoadInst *LI = dyn_cast<LoadInst>(P))
     LI->setAlignment(Align(Bytes));
   else if (StoreInst *SI = dyn_cast<StoreInst>(P))

diff  --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 1d20949f9696..7ddf25b99a6a 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -1246,6 +1246,15 @@ static Value *getAISize(LLVMContext &Context, Value *Amt) {
   return Amt;
 }
 
+Align computeAllocaDefaultAlign(Type *Ty, BasicBlock *BB) {
+  const DataLayout &DL = BB->getModule()->getDataLayout();
+  return DL.getPrefTypeAlign(Ty);
+}
+
+Align computeAllocaDefaultAlign(Type *Ty, Instruction *I) {
+  return computeAllocaDefaultAlign(Ty, I->getParent());
+}
+
 AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name,
                        Instruction *InsertBefore)
   : AllocaInst(Ty, AddrSpace, /*ArraySize=*/nullptr, Name, InsertBefore) {}
@@ -1256,27 +1265,29 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name,
 
 AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
                        const Twine &Name, Instruction *InsertBefore)
-    : AllocaInst(Ty, AddrSpace, ArraySize, /*Align=*/None, Name, InsertBefore) {
-}
+    : AllocaInst(Ty, AddrSpace, ArraySize,
+                 computeAllocaDefaultAlign(Ty, InsertBefore), Name,
+                 InsertBefore) {}
 
 AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
                        const Twine &Name, BasicBlock *InsertAtEnd)
-    : AllocaInst(Ty, AddrSpace, ArraySize, /*Align=*/None, Name, InsertAtEnd) {}
+    : AllocaInst(Ty, AddrSpace, ArraySize,
+                 computeAllocaDefaultAlign(Ty, InsertAtEnd), Name,
+                 InsertAtEnd) {}
 
 AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
-                       MaybeAlign Align, const Twine &Name,
+                       Align Align, const Twine &Name,
                        Instruction *InsertBefore)
     : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca,
                        getAISize(Ty->getContext(), ArraySize), InsertBefore),
       AllocatedType(Ty) {
-  setAlignment(MaybeAlign(Align));
+  setAlignment(Align);
   assert(!Ty->isVoidTy() && "Cannot allocate void!");
   setName(Name);
 }
 
 AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
-                       MaybeAlign Align, const Twine &Name,
-                       BasicBlock *InsertAtEnd)
+                       Align Align, const Twine &Name, BasicBlock *InsertAtEnd)
     : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca,
                        getAISize(Ty->getContext(), ArraySize), InsertAtEnd),
       AllocatedType(Ty) {
@@ -1285,16 +1296,12 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
   setName(Name);
 }
 
-void AllocaInst::setAlignment(MaybeAlign Align) {
-  assert((!Align || *Align <= MaximumAlignment) &&
+void AllocaInst::setAlignment(Align Align) {
+  assert(Align <= MaximumAlignment &&
          "Alignment is greater than MaximumAlignment!");
   setInstructionSubclassData((getSubclassDataFromInstruction() & ~31) |
                              encode(Align));
-  if (Align)
-    assert(getAlignment() == Align->value() &&
-           "Alignment representation error!");
-  else
-    assert(getAlignment() == 0 && "Alignment representation error!");
+  assert(getAlignment() == Align.value() && "Alignment representation error!");
 }
 
 bool AllocaInst::isArrayAllocation() const {
@@ -4240,7 +4247,7 @@ InsertValueInst *InsertValueInst::cloneImpl() const {
 AllocaInst *AllocaInst::cloneImpl() const {
   AllocaInst *Result =
       new AllocaInst(getAllocatedType(), getType()->getAddressSpace(),
-                     (Value *)getOperand(0), MaybeAlign(getAlignment()));
+                     getOperand(0), getAlign());
   Result->setUsedWithInAlloca(isUsedWithInAlloca());
   Result->setSwiftError(isSwiftError());
   return Result;

diff  --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 8169c49285d1..42f6bfb1940e 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -484,7 +484,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
   auto *NewAI = new AllocaInst(
       TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI);
   NewAI->takeName(Info.AI);
-  NewAI->setAlignment(MaybeAlign(Info.AI->getAlignment()));
+  NewAI->setAlignment(Info.AI->getAlign());
   NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca());
   NewAI->setSwiftError(Info.AI->isSwiftError());
   NewAI->copyMetadata(*Info.AI);

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 86e437e68c09..956508e12227 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1416,8 +1416,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
   B.SetInsertPoint(&*ItNew);
   AllocaInst *Alloc = B.CreateAlloca(RetType, 0,
     std::string(prefix) + UI->getName());
-  Alloc->setAlignment(MaybeAlign(
-      UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
+  Alloc->setAlignment(
+      Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
   return Alloc;
 }
 

diff  --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index c3c5f6fbcba7..e60b5eeacdae 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -159,12 +159,14 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
   assert(PType && "Expecting pointer type in handleByValParam");
 
   Type *StructType = PType->getElementType();
-  unsigned AS = Func->getParent()->getDataLayout().getAllocaAddrSpace();
+  const DataLayout &DL = Func->getParent()->getDataLayout();
+  unsigned AS = DL.getAllocaAddrSpace();
   AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
   // Set the alignment to alignment of the byval parameter. This is because,
   // later load/stores assume that alignment, and we are going to replace
   // the use of the byval parameter with this alloca instruction.
-  AllocA->setAlignment(MaybeAlign(Func->getParamAlignment(Arg->getArgNo())));
+  AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo())
+                           .getValueOr(DL.getPrefTypeAlign(StructType)));
   Arg->replaceAllUsesWith(AllocA);
 
   Value *ArgInParam = new AddrSpaceCastInst(

diff  --git a/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
index e89c890386b5..9d364b3097c1 100644
--- a/llvm/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
@@ -34,8 +34,8 @@ struct Lowerer : coro::LowererBase {
 
   Lowerer(Module &M) : LowererBase(M) {}
 
-  void elideHeapAllocations(Function *F, uint64_t FrameSize,
-                            MaybeAlign FrameAlign, AAResults &AA);
+  void elideHeapAllocations(Function *F, uint64_t FrameSize, Align FrameAlign,
+                            AAResults &AA);
   bool shouldElide(Function *F, DominatorTree &DT) const;
   void collectPostSplitCoroIds(Function *F);
   bool processCoroId(CoroIdInst *, AAResults &AA, DominatorTree &DT);
@@ -95,7 +95,7 @@ static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) {
 
 // Given a resume function @f.resume(%f.frame* %frame), returns the size
 // and expected alignment of %f.frame type.
-static std::pair<uint64_t, MaybeAlign> getFrameLayout(Function *Resume) {
+static std::pair<uint64_t, Align> getFrameLayout(Function *Resume) {
   // Prefer to pull information from the function attributes.
   auto Size = Resume->getParamDereferenceableBytes(0);
   auto Align = Resume->getParamAlign(0);
@@ -109,7 +109,7 @@ static std::pair<uint64_t, MaybeAlign> getFrameLayout(Function *Resume) {
     if (!Align) Align = DL.getABITypeAlign(FrameTy);
   }
 
-  return std::make_pair(Size, Align);
+  return std::make_pair(Size, *Align);
 }
 
 // Finds first non alloca instruction in the entry block of a function.
@@ -123,7 +123,7 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function *F) {
 // To elide heap allocations we need to suppress code blocks guarded by
 // llvm.coro.alloc and llvm.coro.free instructions.
 void Lowerer::elideHeapAllocations(Function *F, uint64_t FrameSize,
-                                   MaybeAlign FrameAlign, AAResults &AA) {
+                                   Align FrameAlign, AAResults &AA) {
   LLVMContext &C = F->getContext();
   auto *InsertPt =
       getFirstNonAllocaInTheEntryBlock(CoroIds.front()->getFunction());

diff  --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 204a4697b705..a628c48cb5ca 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -1266,7 +1266,7 @@ static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas,
 
     // Allocate memory.
     auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize());
-    Alloca->setAlignment(MaybeAlign(AI->getAlignment()));
+    Alloca->setAlignment(Align(AI->getAlignment()));
 
     for (auto U : AI->users()) {
       // Replace gets with the allocation.

diff  --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 05aa497dc0e3..3f7656d90a8b 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -385,9 +385,10 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
 
       // Just add all the struct element types.
       Type *AgTy = cast<PointerType>(I->getType())->getElementType();
-      Value *TheAlloca =
-          new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr,
-                         MaybeAlign(I->getParamAlignment()), "", InsertPt);
+      Value *TheAlloca = new AllocaInst(
+          AgTy, DL.getAllocaAddrSpace(), nullptr,
+          I->getParamAlign().getValueOr(DL.getPrefTypeAlign(AgTy)), "",
+          InsertPt);
       StructType *STy = cast<StructType>(AgTy);
       Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
                         nullptr};

diff  --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index e270b822ecc6..3af6d0898f96 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -4724,7 +4724,7 @@ struct AAHeapToStackImpl : public AAHeapToStack {
       LLVM_DEBUG(dbgs() << "H2S: Removing malloc call: " << *MallocCall
                         << "\n");
 
-      MaybeAlign Alignment;
+      Align Alignment;
       Constant *Size;
       if (isCallocLikeFn(MallocCall, TLI)) {
         auto *Num = cast<ConstantInt>(MallocCall->getOperand(0));
@@ -4736,7 +4736,8 @@ struct AAHeapToStackImpl : public AAHeapToStack {
         Size = cast<ConstantInt>(MallocCall->getOperand(1));
         Alignment = MaybeAlign(cast<ConstantInt>(MallocCall->getOperand(0))
                                    ->getValue()
-                                   .getZExtValue());
+                                   .getZExtValue())
+                        .valueOrOne();
       } else {
         Size = cast<ConstantInt>(MallocCall->getOperand(0));
       }

diff  --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
index 35a16cb1c7b6..770ca2ea9130 100644
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -229,7 +229,7 @@ static void mergeInlinedArrayAllocas(Function *Caller, InlineFunctionInfo &IFI,
         }
 
         if (Align1 > Align2)
-          AvailableAlloca->setAlignment(MaybeAlign(AI->getAlignment()));
+          AvailableAlloca->setAlignment(AI->getAlign());
       }
 
       AI->eraseFromParent();

diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index c5b0956232b2..16d94d6946b1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -141,7 +141,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
   }
 
   AllocaInst *New = Builder.CreateAlloca(CastElTy, Amt);
-  New->setAlignment(MaybeAlign(AI.getAlignment()));
+  New->setAlignment(AI.getAlign());
   New->takeName(&AI);
   New->setUsedWithInAlloca(AI.isUsedWithInAlloca());
 

diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 048867c63d00..8bcffcfcae9f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -181,7 +181,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
     if (C->getValue().getActiveBits() <= 64) {
       Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
       AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
-      New->setAlignment(MaybeAlign(AI.getAlignment()));
+      New->setAlignment(AI.getAlign());
 
       // Scan to the end of the allocation instructions, to skip over a block of
       // allocas if possible...also skip interleaved debug info
@@ -327,11 +327,6 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
     return I;
 
   if (AI.getAllocatedType()->isSized()) {
-    // If the alignment is 0 (unspecified), assign it the preferred alignment.
-    if (AI.getAlignment() == 0)
-      AI.setAlignment(
-          MaybeAlign(DL.getPrefTypeAlignment(AI.getAllocatedType())));
-
     // Move all alloca's of zero byte objects to the entry block and merge them
     // together.  Note that we only do this for alloca's, because malloc should
     // allocate and return a unique pointer, even for a zero byte allocation.
@@ -358,16 +353,10 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
           return &AI;
         }
 
-        // If the alignment of the entry block alloca is 0 (unspecified),
-        // assign it the preferred alignment.
-        if (EntryAI->getAlignment() == 0)
-          EntryAI->setAlignment(
-              MaybeAlign(DL.getPrefTypeAlignment(EntryAI->getAllocatedType())));
         // Replace this zero-sized alloca with the one at the start of the entry
         // block after ensuring that the address will be aligned enough for both
         // types.
-        const MaybeAlign MaxAlign(
-            std::max(EntryAI->getAlignment(), AI.getAlignment()));
+        const Align MaxAlign = std::max(EntryAI->getAlign(), AI.getAlign());
         EntryAI->setAlignment(MaxAlign);
         if (AI.getType() != EntryAI->getType())
           return new BitCastInst(EntryAI, AI.getType());

diff  --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 1d679ffe0b60..0fbc4d51a63d 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -2944,7 +2944,7 @@ Value *FunctionStackPoisoner::createAllocaForLayout(
   }
   assert((ClRealignStack & (ClRealignStack - 1)) == 0);
   size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack);
-  Alloca->setAlignment(MaybeAlign(FrameAlignment));
+  Alloca->setAlignment(Align(FrameAlignment));
   return IRB.CreatePointerCast(Alloca, IntptrTy);
 }
 
@@ -3329,7 +3329,7 @@ void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
 void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
   IRBuilder<> IRB(AI);
 
-  const unsigned Align = std::max(kAllocaRzSize, AI->getAlignment());
+  const unsigned Alignment = std::max(kAllocaRzSize, AI->getAlignment());
   const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;
 
   Value *Zero = Constant::getNullValue(IntptrTy);
@@ -3356,21 +3356,21 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
   Value *Cond = IRB.CreateICmpNE(Misalign, AllocaRzSize);
   Value *PartialPadding = IRB.CreateSelect(Cond, Misalign, Zero);
 
-  // AdditionalChunkSize = Align + PartialPadding + kAllocaRzSize
-  // Align is added to locate left redzone, PartialPadding for possible
+  // AdditionalChunkSize = Alignment + PartialPadding + kAllocaRzSize
+  // Alignment is added to locate left redzone, PartialPadding for possible
   // partial redzone and kAllocaRzSize for right redzone respectively.
   Value *AdditionalChunkSize = IRB.CreateAdd(
-      ConstantInt::get(IntptrTy, Align + kAllocaRzSize), PartialPadding);
+      ConstantInt::get(IntptrTy, Alignment + kAllocaRzSize), PartialPadding);
 
   Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize);
 
-  // Insert new alloca with new NewSize and Align params.
+  // Insert new alloca with new NewSize and Alignment params.
   AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize);
-  NewAlloca->setAlignment(MaybeAlign(Align));
+  NewAlloca->setAlignment(Align(Alignment));
 
-  // NewAddress = Address + Align
+  // NewAddress = Address + Alignment
   Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy),
-                                    ConstantInt::get(IntptrTy, Align));
+                                    ConstantInt::get(IntptrTy, Alignment));
 
   // Insert __asan_alloca_poison call for new created alloca.
   IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize});

diff  --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 0054826f2e5b..42882f82aa27 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1159,7 +1159,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
     uint64_t Size = getAllocaSizeInBytes(*AI);
     uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
     AI->setAlignment(
-        MaybeAlign(std::max(AI->getAlignment(), Mapping.getObjectAlignment())));
+        Align(std::max(AI->getAlignment(), Mapping.getObjectAlignment())));
     if (Size != AlignedSize) {
       Type *AllocatedType = AI->getAllocatedType();
       if (AI->isArrayAllocation()) {
@@ -1172,7 +1172,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
       auto *NewAI = new AllocaInst(
           TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI);
       NewAI->takeName(AI);
-      NewAI->setAlignment(MaybeAlign(AI->getAlignment()));
+      NewAI->setAlignment(AI->getAlign());
       NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca());
       NewAI->setSwiftError(AI->isSwiftError());
       NewAI->copyMetadata(*AI);

diff  --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index a091bf6d806b..9c4cdf2feb56 100644
--- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -898,9 +898,8 @@ class GVNHoist {
                                               cast<StoreInst>(I)->getAlign()));
       ++NumStoresRemoved;
     } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
-      ReplacementAlloca->setAlignment(
-          MaybeAlign(std::max(ReplacementAlloca->getAlignment(),
-                              cast<AllocaInst>(I)->getAlignment())));
+      ReplacementAlloca->setAlignment(std::max(
+          ReplacementAlloca->getAlign(), cast<AllocaInst>(I)->getAlign()));
     } else if (isa<CallInst>(Repl)) {
       ++NumCallsRemoved;
     }

diff  --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index a2e13a959ddf..ff02926aa53b 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -636,7 +636,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
             LI, SI->getPointerOperand()->stripPointerCasts(),
             LI->getPointerOperand()->stripPointerCasts(),
             DL.getTypeStoreSize(SI->getOperand(0)->getType()),
-            findCommonAlignment(DL, SI, LI).value(), C);
+            findCommonAlignment(DL, SI, LI), C);
         if (changed) {
           MD->removeInstruction(SI);
           SI->eraseFromParent();
@@ -707,7 +707,7 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
 /// the call write its result directly into the destination of the memcpy.
 bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
                                          Value *cpySrc, uint64_t cpyLen,
-                                         unsigned cpyAlign, CallInst *C) {
+                                         Align cpyAlign, CallInst *C) {
   // The general transformation to keep in mind is
   //
   //   call @func(..., src, ...)
@@ -785,9 +785,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
   }
 
   // Check that dest points to memory that is at least as aligned as src.
-  unsigned srcAlign = srcAlloca->getAlignment();
-  if (!srcAlign)
-    srcAlign = DL.getABITypeAlignment(srcAlloca->getAllocatedType());
+  Align srcAlign = srcAlloca->getAlign();
   bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
   // If dest is not aligned enough and we can't increase its alignment then
   // bail out.
@@ -882,7 +880,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
   // If the destination wasn't sufficiently aligned then increase its alignment.
   if (!isDestSufficientlyAligned) {
     assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
-    cast<AllocaInst>(cpyDest)->setAlignment(MaybeAlign(srcAlign));
+    cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
   }
 
   // Drop any cached information about the call, because we may have changed
@@ -1167,10 +1165,10 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
     if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
       // FIXME: Can we pass in either of dest/src alignment here instead
       // of conservatively taking the minimum?
-      unsigned Align = MinAlign(M->getDestAlignment(), M->getSourceAlignment());
+      Align Alignment = std::min(M->getDestAlign().valueOrOne(),
+                                 M->getSourceAlign().valueOrOne());
       if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
-                               CopySize->getZExtValue(), Align,
-                               C)) {
+                               CopySize->getZExtValue(), Alignment, C)) {
         MD->removeInstruction(M);
         M->eraseFromParent();
         return true;

diff  --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 8b95bca89e11..01b706332e7a 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -4198,7 +4198,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
     const bool IsUnconstrained = Alignment <= DL.getABITypeAlignment(SliceTy);
     NewAI = new AllocaInst(
         SliceTy, AI.getType()->getAddressSpace(), nullptr,
-        IsUnconstrained ? MaybeAlign() : Alignment,
+        IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
         AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
     // Copy the old AI debug location over to the new one.
     NewAI->setDebugLoc(AI.getDebugLoc());

diff  --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 6d6e74c9d759..ae4ef97b2fd0 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -1181,7 +1181,7 @@ static Align enforceKnownAlignment(Value *V, Align Alignment, Align PrefAlign,
     // stripPointerCasts recurses through infinite layers of bitcasts,
     // while computeKnownBits is not allowed to traverse more than 6
     // levels.
-    Alignment = max(AI->getAlign(), Alignment);
+    Alignment = std::max(AI->getAlign(), Alignment);
     if (PrefAlign <= Alignment)
       return Alignment;
 

diff  --git a/llvm/test/Assembler/alloca-addrspace-elems.ll b/llvm/test/Assembler/alloca-addrspace-elems.ll
index 8c02760fe95f..40f7a051003c 100644
--- a/llvm/test/Assembler/alloca-addrspace-elems.ll
+++ b/llvm/test/Assembler/alloca-addrspace-elems.ll
@@ -4,11 +4,11 @@ target datalayout = "A5"
 ; CHECK: target datalayout = "A5"
 
 
-; CHECK: %alloca_array_no_align = alloca i32, i32 9, addrspace(5)
+; CHECK: %alloca_array_no_align = alloca i32, i32 9, align 4, addrspace(5)
 ; CHECK-NEXT: %alloca_array_align4 = alloca i32, i32 9, align 4, addrspace(5)
-; CHECK-NEXT: %alloca_array_no_align_metadata = alloca i32, i32 9, addrspace(5), !foo !0
+; CHECK-NEXT: %alloca_array_no_align_metadata = alloca i32, i32 9, align 4, addrspace(5), !foo !0
 ; CHECK-NEXT: %alloca_array_align4_metadata = alloca i32, i32 9, align 4, addrspace(5), !foo !0
-; CHECK-NEXT: %alloca_inalloca_array_no_align = alloca inalloca i32, i32 9, addrspace(5)
+; CHECK-NEXT: %alloca_inalloca_array_no_align = alloca inalloca i32, i32 9, align 4, addrspace(5)
 ; CHECK-NEXT: %alloca_inalloca_array_align4_metadata = alloca inalloca i32, i32 9, align 4, addrspace(5), !foo !0
 
 define void @use_alloca() {

diff  --git a/llvm/test/Assembler/alloca-addrspace0.ll b/llvm/test/Assembler/alloca-addrspace0.ll
index 09b7a323f62f..1ad4f14a08cf 100644
--- a/llvm/test/Assembler/alloca-addrspace0.ll
+++ b/llvm/test/Assembler/alloca-addrspace0.ll
@@ -4,11 +4,11 @@ target datalayout = "A0"
 ; CHECK: target datalayout = "A0"
 
 
-; CHECK: %alloca_scalar_no_align = alloca i32
+; CHECK: %alloca_scalar_no_align = alloca i32, align 4
 ; CHECK-NEXT: %alloca_scalar_align4 = alloca i32, align 4
-; CHECK-NEXT: %alloca_scalar_no_align_metadata = alloca i32, !foo !0
+; CHECK-NEXT: %alloca_scalar_no_align_metadata = alloca i32, align 4, !foo !0
 ; CHECK-NEXT: %alloca_scalar_align4_metadata = alloca i32, align 4, !foo !0
-; CHECK-NEXT: %alloca_inalloca_scalar_no_align = alloca inalloca i32
+; CHECK-NEXT: %alloca_inalloca_scalar_no_align = alloca inalloca i32, align 4
 ; CHECK-NEXT: %alloca_inalloca_scalar_align4_metadata = alloca inalloca i32, align 4, !foo !0
 define void @use_alloca() {
   %alloca_scalar_no_align = alloca i32, addrspace(0)

diff  --git a/llvm/test/Assembler/block-labels.ll b/llvm/test/Assembler/block-labels.ll
index 9ab55e00c2d5..af9f82459bfc 100644
--- a/llvm/test/Assembler/block-labels.ll
+++ b/llvm/test/Assembler/block-labels.ll
@@ -21,7 +21,7 @@ $N:
 }
 
 ; CHECK-LABEL: define i32 @test1(i32 %X) {
-; CHECK-NEXT:   %1 = alloca i32
+; CHECK-NEXT:   %1 = alloca i32, align 4
 ; CHECK-NEXT:   br label %2
 ; CHECK:      2:       ; preds = %0
 ; CHECK-NEXT:   br label %3

diff  --git a/llvm/test/Assembler/datalayout-alloca-addrspace-mismatch-0.ll b/llvm/test/Assembler/datalayout-alloca-addrspace-mismatch-0.ll
index 828aa133c762..3ab5890a1441 100644
--- a/llvm/test/Assembler/datalayout-alloca-addrspace-mismatch-0.ll
+++ b/llvm/test/Assembler/datalayout-alloca-addrspace-mismatch-0.ll
@@ -3,7 +3,7 @@
 target datalayout = "A1"
 
 ; CHECK: Allocation instruction pointer not in the stack address space!
-; CHECK-NEXT:  %alloca_scalar_no_align = alloca i32, addrspace(2)
+; CHECK-NEXT:  %alloca_scalar_no_align = alloca i32, align 4, addrspace(2)
 
 define void @use_alloca() {
   %alloca_scalar_no_align = alloca i32, addrspace(2)

diff  --git a/llvm/test/Assembler/datalayout-alloca-addrspace.ll b/llvm/test/Assembler/datalayout-alloca-addrspace.ll
index 578b7ef0b37d..6e95ef15dd50 100644
--- a/llvm/test/Assembler/datalayout-alloca-addrspace.ll
+++ b/llvm/test/Assembler/datalayout-alloca-addrspace.ll
@@ -3,11 +3,11 @@
 target datalayout = "A1"
 ; CHECK: target datalayout = "A1"
 
-; CHECK: %alloca_scalar_no_align = alloca i32, addrspace(1)
+; CHECK: %alloca_scalar_no_align = alloca i32, align 4, addrspace(1)
 ; CHECK-NEXT: %alloca_scalar_align4 = alloca i32, align 4, addrspace(1)
-; CHECK-NEXT: %alloca_scalar_no_align_metadata = alloca i32, addrspace(1), !foo !0
+; CHECK-NEXT: %alloca_scalar_no_align_metadata = alloca i32, align 4, addrspace(1), !foo !0
 ; CHECK-NEXT: %alloca_scalar_align4_metadata = alloca i32, align 4, addrspace(1), !foo !0
-; CHECK-NEXT: %alloca_inalloca_scalar_no_align = alloca inalloca i32, addrspace(1)
+; CHECK-NEXT: %alloca_inalloca_scalar_no_align = alloca inalloca i32, align 4, addrspace(1)
 ; CHECK-NEXT: %alloca_inalloca_scalar_align4_metadata = alloca inalloca i32, align 4, addrspace(1), !foo !0
 define void @use_alloca() {
   %alloca_scalar_no_align = alloca i32, addrspace(1)

diff  --git a/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll b/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll
index 3e2aefb0e11b..01446dfb6535 100644
--- a/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll
+++ b/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll
@@ -6,7 +6,7 @@
 define void @foo() {
 entry:
 ; DIS: target datalayout = "A5"
-; DIS: %tmp = alloca i32, addrspace(5)
+; DIS: %tmp = alloca i32, align 4, addrspace(5)
   %tmp = alloca i32, addrspace(5)
   call void @llvm.dbg.value(
       metadata i8* undef,

diff  --git a/llvm/test/CodeGen/AMDGPU/alloca.ll b/llvm/test/CodeGen/AMDGPU/alloca.ll
index 58d8d2f40802..0e82bb97d5dc 100644
--- a/llvm/test/CodeGen/AMDGPU/alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/alloca.ll
@@ -4,7 +4,7 @@
 ; RUN: opt -data-layout=A5 -S < %s
 ; RUN: llvm-as -data-layout=A5 < %s | opt -S
 
-; CHECK: %tmp = alloca i32, addrspace(5)
+; CHECK: %tmp = alloca i32, align 4, addrspace(5)
 define amdgpu_kernel void @test() {
   %tmp = alloca i32, addrspace(5)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
index 3a70a965163f..9daaeee89cea 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
@@ -246,7 +246,7 @@ define amdgpu_kernel void @kern_i24(i24 %arg0) {
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
-; HSA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef
+; HSA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef, align 4
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_i24(
@@ -255,7 +255,7 @@ define amdgpu_kernel void @kern_i24(i24 %arg0) {
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
-; MESA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef
+; MESA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef, align 4
 ; MESA-NEXT:    ret void
 ;
   store i24 %arg0, i24 addrspace(1)* undef
@@ -268,7 +268,7 @@ define amdgpu_kernel void @kern_i32(i32 %arg0) {
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_i32(
@@ -276,7 +276,7 @@ define amdgpu_kernel void @kern_i32(i32 %arg0) {
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
 ; MESA-NEXT:    ret void
 ;
   store i32 %arg0, i32 addrspace(1)* undef
@@ -289,7 +289,7 @@ define amdgpu_kernel void @kern_f32(float %arg0) {
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef
+; HSA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef, align 4
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_f32(
@@ -297,7 +297,7 @@ define amdgpu_kernel void @kern_f32(float %arg0) {
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef
+; MESA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef, align 4
 ; MESA-NEXT:    ret void
 ;
   store float %arg0, float addrspace(1)* undef
@@ -333,7 +333,7 @@ define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 {
 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
 ; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef
+; HSA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef, align 32
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_v8i32(
@@ -341,7 +341,7 @@ define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 {
 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
 ; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef
+; MESA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef, align 32
 ; MESA-NEXT:    ret void
 ;
   store <8 x i32> %arg, <8 x i32> addrspace(1)* undef
@@ -354,7 +354,7 @@ define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 {
 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
 ; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef
+; HSA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef, align 64
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_v8i64(
@@ -362,7 +362,7 @@ define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 {
 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
 ; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef
+; MESA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef, align 64
 ; MESA-NEXT:    ret void
 ;
   store <8 x i64> %arg, <8 x i64> addrspace(1)* undef
@@ -375,7 +375,7 @@ define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 {
 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
 ; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef
+; HSA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef, align 128
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_v16i64(
@@ -383,7 +383,7 @@ define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 {
 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
 ; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef
+; MESA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef, align 128
 ; MESA-NEXT:    ret void
 ;
   store <16 x i64> %arg, <16 x i64> addrspace(1)* undef
@@ -400,7 +400,7 @@ define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
 ; HSA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
 ; HSA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
 ; HSA-NEXT:    ret void
 ;
@@ -413,7 +413,7 @@ define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
 ; MESA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
 ; MESA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
 ; MESA-NEXT:    ret void
 ;
@@ -431,7 +431,7 @@ define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef
+; HSA-NEXT:    store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef, align 4
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_struct_a(
@@ -439,7 +439,7 @@ define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef
+; MESA-NEXT:    store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef, align 4
 ; MESA-NEXT:    ret void
 ;
   store %struct.a %arg0, %struct.a addrspace(1)* undef
@@ -452,7 +452,7 @@ define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef
+; HSA-NEXT:    store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef, align 16
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_struct_b_packed(
@@ -460,7 +460,7 @@ define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef
+; MESA-NEXT:    store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef, align 16
 ; MESA-NEXT:    ret void
 ;
   store %struct.b.packed %arg0, %struct.b.packed addrspace(1)* undef
@@ -473,7 +473,7 @@ define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 {
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_implicit_arg_num_bytes(
@@ -481,7 +481,7 @@ define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 {
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
+; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
 ; MESA-NEXT:    ret void
 ;
   store i32 %arg0, i32 addrspace(1)* undef
@@ -494,7 +494,7 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align(<16 x i32>, i32 %a
 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 64
 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
+; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kernel_implicitarg_no_struct_align(
@@ -502,7 +502,7 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align(<16 x i32>, i32 %a
 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 100
 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
+; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
 ; MESA-NEXT:    ret void
 ;
   store i32 %arg1, i32 addrspace(1)* undef
@@ -560,8 +560,8 @@ define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
-; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_i8_i8(
@@ -575,8 +575,8 @@ define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
-; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 %arg0, i8 addrspace(1)* undef
@@ -601,9 +601,9 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #
 ; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
 ; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
 ; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
-; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_i8_i8_i8(
@@ -622,9 +622,9 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #
 ; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
 ; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
-; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 %arg0, i8 addrspace(1)* undef
@@ -655,10 +655,10 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2
 ; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
 ; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
 ; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
-; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_i8_i8_i8_i8(
@@ -682,10 +682,10 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2
 ; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
 ; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
-; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 %arg0, i8 addrspace(1)* undef
@@ -707,8 +707,8 @@ define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
-; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef, align 4
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_i8_v3i8(
@@ -722,8 +722,8 @@ define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
 ; MESA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
-; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef, align 4
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 %arg0, i8 addrspace(1)* undef
@@ -743,8 +743,8 @@ define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
-; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_i8_i16(
@@ -758,8 +758,8 @@ define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
-; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 %arg0, i8 addrspace(1)* undef
@@ -779,8 +779,8 @@ define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
-; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
-; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
+; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_i1_i1(
@@ -794,8 +794,8 @@ define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
-; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
-; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
+; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
 ; MESA-NEXT:    ret void
 ;
   store volatile i1 %arg0, i1 addrspace(1)* undef
@@ -820,9 +820,9 @@ define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #
 ; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
 ; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
 ; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
-; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
-; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
-; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef
+; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_i1_i1_i1(
@@ -841,9 +841,9 @@ define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #
 ; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
 ; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
-; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
-; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
-; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef
+; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
 ; MESA-NEXT:    ret void
 ;
   store volatile i1 %arg0, i1 addrspace(1)* undef
@@ -874,10 +874,10 @@ define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2
 ; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
 ; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
 ; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
-; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
-; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
-; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef
-; HSA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef
+; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef, align 1
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_i1_i1_i1_i1(
@@ -901,10 +901,10 @@ define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2
 ; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
 ; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
-; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
-; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef
-; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef
-; MESA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef
+; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef, align 1
 ; MESA-NEXT:    ret void
 ;
   store volatile i1 %arg0, i1 addrspace(1)* undef
@@ -926,8 +926,8 @@ define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1>
-; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
-; HSA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef
+; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 4
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_i1_v3i1(
@@ -941,8 +941,8 @@ define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
 ; MESA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1>
-; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
-; MESA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef
+; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 4
 ; MESA-NEXT:    ret void
 ;
   store volatile i1 %arg0, i1 addrspace(1)* undef
@@ -962,8 +962,8 @@ define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
-; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
-; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef
+; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_i1_i16(
@@ -977,8 +977,8 @@ define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
-; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef
-; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef
+; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
 ; MESA-NEXT:    ret void
 ;
   store volatile i1 %arg0, i1 addrspace(1)* undef
@@ -1023,13 +1023,13 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %ar
 ; HSA-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
 ; HSA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
 ; HSA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
-; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef, align 1
+; HSA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef, align 1
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
@@ -1068,13 +1068,13 @@ define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %ar
 ; MESA-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
 ; MESA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
 ; MESA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
-; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef, align 1
+; MESA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef, align 1
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 %arg0, i8 addrspace(1)* undef
@@ -1101,8 +1101,8 @@ define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
-; HSA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef
-; HSA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef
+; HSA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef, align 2
+; HSA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef, align 2
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_realign_f16_f16(
@@ -1118,8 +1118,8 @@ define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
-; MESA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef
-; MESA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef
+; MESA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef, align 2
+; MESA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef, align 2
 ; MESA-NEXT:    ret void
 ;
   store volatile half %arg0, half addrspace(1)* undef
@@ -1133,7 +1133,7 @@ define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 {
 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
 ; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_global_ptr(
@@ -1141,7 +1141,7 @@ define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 {
 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
 ; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
@@ -1154,7 +1154,7 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* deref
 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
 ; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable !1
-; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_global_ptr_dereferencable(
@@ -1162,7 +1162,7 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* deref
 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
 ; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable !1
-; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
@@ -1175,7 +1175,7 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1
 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
 ; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable_or_null !2
-; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_global_ptr_dereferencable_or_null(
@@ -1183,7 +1183,7 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1
 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
 ; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable_or_null !2
-; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
@@ -1196,7 +1196,7 @@ define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr
 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
 ; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !nonnull !0
-; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_nonnull_global_ptr(
@@ -1204,7 +1204,7 @@ define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr
 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
 ; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !nonnull !0
-; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
@@ -1217,7 +1217,7 @@ define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %
 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
 ; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !align !3
-; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_align32_global_ptr(
@@ -1225,7 +1225,7 @@ define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %
 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
 ; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !align !3
-; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
@@ -1235,12 +1235,12 @@ define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %
 define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr) #0 {
 ; HSA-LABEL: @kern_noalias_global_ptr(
 ; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_noalias_global_ptr(
 ; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
@@ -1250,14 +1250,14 @@ define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr
 define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %ptr0, i8 addrspace(1)* noalias %ptr1) #0 {
 ; HSA-LABEL: @kern_noalias_global_ptr_x2(
 ; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef
-; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
+; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @kern_noalias_global_ptr_x2(
 ; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef
-; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
+; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
 ; MESA-NEXT:    ret void
 ;
   store volatile i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* undef
@@ -1414,7 +1414,7 @@ define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
+; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @empty_struct_with_other(
@@ -1422,7 +1422,7 @@ define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
+; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
 ; MESA-NEXT:    ret void
 ;
   store i32 %arg1, i32 addrspace(1)* undef
@@ -1432,21 +1432,21 @@ define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
 ; Should insert code after the allocas
 define amdgpu_kernel void @static_alloca_kern_i32(i32 %arg0) {
 ; HSA-LABEL: @static_alloca_kern_i32(
-; HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, addrspace(5)
+; HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; HSA-NEXT:    [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    store volatile i32 [[ARG0_LOAD]], i32 addrspace(5)* [[ALLOCA]]
+; HSA-NEXT:    store volatile i32 [[ARG0_LOAD]], i32 addrspace(5)* [[ALLOCA]], align 4
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @static_alloca_kern_i32(
-; MESA-NEXT:    [[ALLOCA:%.*]] = alloca i32, addrspace(5)
+; MESA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; MESA-NEXT:    [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    store volatile i32 [[ARG0_LOAD]], i32 addrspace(5)* [[ALLOCA]]
+; MESA-NEXT:    store volatile i32 [[ARG0_LOAD]], i32 addrspace(5)* [[ALLOCA]], align 4
 ; MESA-NEXT:    ret void
 ;
   %alloca = alloca i32, addrspace(5)
@@ -1458,25 +1458,25 @@ define amdgpu_kernel void @static_alloca_kern_i32(i32 %arg0) {
 ; kernargs.
 define amdgpu_kernel void @dyn_alloca_kernarg_i32(i32 %n) {
 ; HSA-LABEL: @dyn_alloca_kernarg_i32(
-; HSA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, addrspace(5)
+; HSA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
 ; HSA-NEXT:    [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
 ; HSA-NEXT:    [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 0
 ; HSA-NEXT:    [[N_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[N_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; HSA-NEXT:    [[N_LOAD:%.*]] = load i32, i32 addrspace(4)* [[N_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
-; HSA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], addrspace(5)
-; HSA-NEXT:    store volatile i32 0, i32 addrspace(5)* [[ALLOCA0]]
-; HSA-NEXT:    store volatile i32 1, i32 addrspace(5)* [[ALLOCA1]]
+; HSA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
+; HSA-NEXT:    store volatile i32 0, i32 addrspace(5)* [[ALLOCA0]], align 4
+; HSA-NEXT:    store volatile i32 1, i32 addrspace(5)* [[ALLOCA1]], align 4
 ; HSA-NEXT:    ret void
 ;
 ; MESA-LABEL: @dyn_alloca_kernarg_i32(
-; MESA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, addrspace(5)
+; MESA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
 ; MESA-NEXT:    [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
 ; MESA-NEXT:    [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 36
 ; MESA-NEXT:    [[N_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[N_KERNARG_OFFSET]] to i32 addrspace(4)*
 ; MESA-NEXT:    [[N_LOAD:%.*]] = load i32, i32 addrspace(4)* [[N_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
-; MESA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], addrspace(5)
-; MESA-NEXT:    store volatile i32 0, i32 addrspace(5)* [[ALLOCA0]]
-; MESA-NEXT:    store volatile i32 1, i32 addrspace(5)* [[ALLOCA1]]
+; MESA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
+; MESA-NEXT:    store volatile i32 0, i32 addrspace(5)* [[ALLOCA0]], align 4
+; MESA-NEXT:    store volatile i32 1, i32 addrspace(5)* [[ALLOCA1]], align 4
 ; MESA-NEXT:    ret void
 ;
   %alloca0 = alloca i32, addrspace(5)

diff  --git a/llvm/test/Transforms/ArgumentPromotion/attrs.ll b/llvm/test/Transforms/ArgumentPromotion/attrs.ll
index 7d801ebf60ff..07b96038d62a 100644
--- a/llvm/test/Transforms/ArgumentPromotion/attrs.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/attrs.ll
@@ -9,16 +9,16 @@ define internal void @f(%struct.ss* byval %b, i32* byval %X, i32 %i) nounwind {
 ; CHECK-LABEL: define {{[^@]+}}@f
 ; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]], i32* byval [[X:%.*]], i32 [[I:%.*]])
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]]
+; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 4
 ; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]]
+; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
 ; CHECK-NEXT:    [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
 ; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP]], align 4
-; CHECK-NEXT:    store i32 0, i32* [[X]]
+; CHECK-NEXT:    store i32 0, i32* [[X]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -37,15 +37,15 @@ define i32 @test(i32* %X) {
 ; CHECK-LABEL: define {{[^@]+}}@test
 ; CHECK-SAME: (i32* [[X:%.*]])
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; CHECK-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; CHECK-NEXT:    store i64 2, i64* [[TMP4]], align 4
 ; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]]
+; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 4
 ; CHECK-NEXT:    [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]]
+; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]], align 4
 ; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i64 [[S_1_VAL]], i32* byval [[X]], i32 zeroext 0)
 ; CHECK-NEXT:    ret i32 0
 ;

diff  --git a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll
index f46a1f371cc9..f95493f60688 100644
--- a/llvm/test/Transforms/ArgumentPromotion/byval-2.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/byval-2.ll
@@ -11,16 +11,16 @@ define internal void @f(%struct.ss* byval  %b, i32* byval %X) nounwind  {
 ; CHECK-LABEL: define {{[^@]+}}@f
 ; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]], i32* byval [[X:%.*]])
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]]
+; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 4
 ; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]]
+; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
 ; CHECK-NEXT:    [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
 ; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP]], align 4
-; CHECK-NEXT:    store i32 0, i32* [[X]]
+; CHECK-NEXT:    store i32 0, i32* [[X]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -37,15 +37,15 @@ define i32 @test(i32* %X) {
 ; CHECK-LABEL: define {{[^@]+}}@test
 ; CHECK-SAME: (i32* [[X:%.*]])
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; CHECK-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; CHECK-NEXT:    store i64 2, i64* [[TMP4]], align 4
 ; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]]
+; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 4
 ; CHECK-NEXT:    [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]]
+; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]], align 4
 ; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i64 [[S_1_VAL]], i32* byval [[X]])
 ; CHECK-NEXT:    ret i32 0
 ;

diff  --git a/llvm/test/Transforms/ArgumentPromotion/byval.ll b/llvm/test/Transforms/ArgumentPromotion/byval.ll
index 020c0f292be8..359ce4ef848c 100644
--- a/llvm/test/Transforms/ArgumentPromotion/byval.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/byval.ll
@@ -10,11 +10,11 @@ define internal void @f(%struct.ss* byval  %b) nounwind  {
 ; CHECK-LABEL: define {{[^@]+}}@f
 ; CHECK-SAME: (i32 [[B_0:%.*]], i64 [[B_1:%.*]])
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
 ; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]]
+; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 4
 ; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]]
+; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
 ; CHECK-NEXT:    [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
@@ -36,9 +36,9 @@ define internal void @g(%struct.ss* byval align 32 %b) nounwind {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_SS:%.*]], align 32
 ; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]]
+; CHECK-NEXT:    store i32 [[B_0]], i32* [[DOT0]], align 4
 ; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 1
-; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]]
+; CHECK-NEXT:    store i64 [[B_1]], i64* [[DOT1]], align 4
 ; CHECK-NEXT:    [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B]], i32 0, i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
@@ -57,20 +57,20 @@ entry:
 define i32 @main() nounwind  {
 ; CHECK-LABEL: define {{[^@]+}}@main()
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; CHECK-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; CHECK-NEXT:    store i64 2, i64* [[TMP4]], align 4
 ; CHECK-NEXT:    [[S_0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]]
+; CHECK-NEXT:    [[S_0_VAL:%.*]] = load i32, i32* [[S_0]], align 4
 ; CHECK-NEXT:    [[S_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]]
+; CHECK-NEXT:    [[S_1_VAL:%.*]] = load i64, i64* [[S_1]], align 4
 ; CHECK-NEXT:    call void @f(i32 [[S_0_VAL]], i64 [[S_1_VAL]])
 ; CHECK-NEXT:    [[S_01:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[S_01_VAL:%.*]] = load i32, i32* [[S_01]]
+; CHECK-NEXT:    [[S_01_VAL:%.*]] = load i32, i32* [[S_01]], align 4
 ; CHECK-NEXT:    [[S_12:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[S_12_VAL:%.*]] = load i64, i64* [[S_12]]
+; CHECK-NEXT:    [[S_12_VAL:%.*]] = load i64, i64* [[S_12]], align 4
 ; CHECK-NEXT:    call void @g(i32 [[S_01_VAL]], i64 [[S_12_VAL]])
 ; CHECK-NEXT:    ret i32 0
 ;

diff  --git a/llvm/test/Transforms/ArgumentPromotion/dbg.ll b/llvm/test/Transforms/ArgumentPromotion/dbg.ll
index e576ec099c61..1da97c3144e9 100644
--- a/llvm/test/Transforms/ArgumentPromotion/dbg.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/dbg.ll
@@ -21,11 +21,11 @@ define internal void @test(i32** %X) !dbg !2 {
 define internal void @test_byval(%struct.pair* byval %P) {
 ; CHECK-LABEL: define {{[^@]+}}@test_byval
 ; CHECK-SAME: (i32 [[P_0:%.*]], i32 [[P_1:%.*]])
-; CHECK-NEXT:    [[P:%.*]] = alloca [[STRUCT_PAIR:%.*]]
+; CHECK-NEXT:    [[P:%.*]] = alloca [[STRUCT_PAIR:%.*]], align 8
 ; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[STRUCT_PAIR]], %struct.pair* [[P]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[P_0]], i32* [[DOT0]]
+; CHECK-NEXT:    store i32 [[P_0]], i32* [[DOT0]], align 4
 ; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[STRUCT_PAIR]], %struct.pair* [[P]], i32 0, i32 1
-; CHECK-NEXT:    store i32 [[P_1]], i32* [[DOT1]]
+; CHECK-NEXT:    store i32 [[P_1]], i32* [[DOT1]], align 4
 ; CHECK-NEXT:    ret void
 ;
   ret void

diff  --git a/llvm/test/Transforms/ArgumentPromotion/tail.ll b/llvm/test/Transforms/ArgumentPromotion/tail.ll
index f4bb06a6e4ef..62c6f202dc7d 100644
--- a/llvm/test/Transforms/ArgumentPromotion/tail.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/tail.ll
@@ -12,11 +12,11 @@ declare i8* @foo(%pair*)
 define internal void @bar(%pair* byval %Data) {
 ; CHECK-LABEL: define {{[^@]+}}@bar
 ; CHECK-SAME: (i32 [[DATA_0:%.*]], i32 [[DATA_1:%.*]])
-; CHECK-NEXT:    [[DATA:%.*]] = alloca [[PAIR:%.*]]
+; CHECK-NEXT:    [[DATA:%.*]] = alloca [[PAIR:%.*]], align 8
 ; CHECK-NEXT:    [[DOT0:%.*]] = getelementptr [[PAIR]], %pair* [[DATA]], i32 0, i32 0
-; CHECK-NEXT:    store i32 [[DATA_0]], i32* [[DOT0]]
+; CHECK-NEXT:    store i32 [[DATA_0]], i32* [[DOT0]], align 4
 ; CHECK-NEXT:    [[DOT1:%.*]] = getelementptr [[PAIR]], %pair* [[DATA]], i32 0, i32 1
-; CHECK-NEXT:    store i32 [[DATA_1]], i32* [[DOT1]]
+; CHECK-NEXT:    store i32 [[DATA_1]], i32* [[DOT1]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i8* @foo(%pair* [[DATA]])
 ; CHECK-NEXT:    ret void
 ;
@@ -28,9 +28,9 @@ define void @zed(%pair* byval %Data) {
 ; CHECK-LABEL: define {{[^@]+}}@zed
 ; CHECK-SAME: (%pair* byval [[DATA:%.*]])
 ; CHECK-NEXT:    [[DATA_0:%.*]] = getelementptr [[PAIR:%.*]], %pair* [[DATA]], i32 0, i32 0
-; CHECK-NEXT:    [[DATA_0_VAL:%.*]] = load i32, i32* [[DATA_0]]
+; CHECK-NEXT:    [[DATA_0_VAL:%.*]] = load i32, i32* [[DATA_0]], align 4
 ; CHECK-NEXT:    [[DATA_1:%.*]] = getelementptr [[PAIR]], %pair* [[DATA]], i32 0, i32 1
-; CHECK-NEXT:    [[DATA_1_VAL:%.*]] = load i32, i32* [[DATA_1]]
+; CHECK-NEXT:    [[DATA_1_VAL:%.*]] = load i32, i32* [[DATA_1]], align 4
 ; CHECK-NEXT:    call void @bar(i32 [[DATA_0_VAL]], i32 [[DATA_1_VAL]])
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
index 401fccc8b034..524b26c2b128 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
@@ -24,13 +24,13 @@ define internal i32 @f(%struct.ss* byval %b, i32* byval %X, i32 %i) nounwind {
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@f
 ; IS__TUNIT_NPM-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[I:%.*]])
 ; IS__TUNIT_NPM-NEXT:  entry:
-; IS__TUNIT_NPM-NEXT:    [[X_PRIV:%.*]] = alloca i32
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP2]], i32* [[X_PRIV]]
-; IS__TUNIT_NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__TUNIT_NPM-NEXT:    [[X_PRIV:%.*]] = alloca i32, align 4
+; IS__TUNIT_NPM-NEXT:    store i32 [[TMP2]], i32* [[X_PRIV]], align 4
+; IS__TUNIT_NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; IS__TUNIT_NPM-NEXT:    [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP0]], i32* [[B_PRIV_CAST]]
+; IS__TUNIT_NPM-NEXT:    store i32 [[TMP0]], i32* [[B_PRIV_CAST]], align 4
 ; IS__TUNIT_NPM-NEXT:    [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 1
-; IS__TUNIT_NPM-NEXT:    store i64 [[TMP1]], i64* [[B_PRIV_0_1]]
+; IS__TUNIT_NPM-NEXT:    store i64 [[TMP1]], i64* [[B_PRIV_0_1]], align 4
 ; IS__TUNIT_NPM-NEXT:    [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 0
 ; IS__TUNIT_NPM-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8
 ; IS__TUNIT_NPM-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
@@ -55,9 +55,9 @@ define internal i32 @f(%struct.ss* byval %b, i32* byval %X, i32 %i) nounwind {
 ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@f
 ; IS__CGSCC_NPM-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], i32 [[TMP2:%.*]])
 ; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    [[X_PRIV:%.*]] = alloca i32
+; IS__CGSCC_NPM-NEXT:    [[X_PRIV:%.*]] = alloca i32, align 4
 ; IS__CGSCC_NPM-NEXT:    store i32 [[TMP2]], i32* [[X_PRIV]], align 4
-; IS__CGSCC_NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__CGSCC_NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; IS__CGSCC_NPM-NEXT:    [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
 ; IS__CGSCC_NPM-NEXT:    store i32 [[TMP0]], i32* [[B_PRIV_CAST]], align 4
 ; IS__CGSCC_NPM-NEXT:    [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 1
@@ -90,7 +90,7 @@ define i32 @test(i32* %X) {
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test
 ; IS__TUNIT_OPM-SAME: (i32* nocapture nofree readonly align 4 [[X:%.*]])
 ; IS__TUNIT_OPM-NEXT:  entry:
-; IS__TUNIT_OPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__TUNIT_OPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; IS__TUNIT_OPM-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; IS__TUNIT_OPM-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; IS__TUNIT_OPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
@@ -101,7 +101,7 @@ define i32 @test(i32* %X) {
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@test
 ; IS__TUNIT_NPM-SAME: (i32* nocapture nofree readonly align 4 [[X:%.*]])
 ; IS__TUNIT_NPM-NEXT:  entry:
-; IS__TUNIT_NPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__TUNIT_NPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; IS__TUNIT_NPM-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; IS__TUNIT_NPM-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; IS__TUNIT_NPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
@@ -117,7 +117,7 @@ define i32 @test(i32* %X) {
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test
 ; IS__CGSCC_OPM-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(4) [[X:%.*]])
 ; IS__CGSCC_OPM-NEXT:  entry:
-; IS__CGSCC_OPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__CGSCC_OPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; IS__CGSCC_OPM-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; IS__CGSCC_OPM-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; IS__CGSCC_OPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
@@ -128,7 +128,7 @@ define i32 @test(i32* %X) {
 ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@test
 ; IS__CGSCC_NPM-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[X:%.*]])
 ; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__CGSCC_NPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; IS__CGSCC_NPM-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; IS__CGSCC_NPM-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1

diff  --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll
index b70d05fcf16b..ccfbb9463f90 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll
@@ -20,9 +20,9 @@ define internal void @f(%struct.ss* byval  %b, i32* byval %X) nounwind  {
 ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@f
 ; IS__CGSCC_NPM-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], i32 [[TMP2:%.*]])
 ; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    [[X_PRIV:%.*]] = alloca i32
+; IS__CGSCC_NPM-NEXT:    [[X_PRIV:%.*]] = alloca i32, align 4
 ; IS__CGSCC_NPM-NEXT:    store i32 [[TMP2]], i32* [[X_PRIV]], align 4
-; IS__CGSCC_NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__CGSCC_NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; IS__CGSCC_NPM-NEXT:    [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
 ; IS__CGSCC_NPM-NEXT:    store i32 [[TMP0]], i32* [[B_PRIV_CAST]], align 4
 ; IS__CGSCC_NPM-NEXT:    [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 1
@@ -49,7 +49,7 @@ define i32 @test(i32* %X) {
 ; IS__TUNIT____-LABEL: define {{[^@]+}}@test
 ; IS__TUNIT____-SAME: (i32* nocapture nofree readonly align 4 [[X:%.*]])
 ; IS__TUNIT____-NEXT:  entry:
-; IS__TUNIT____-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__TUNIT____-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; IS__TUNIT____-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; IS__TUNIT____-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; IS__TUNIT____-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
@@ -59,7 +59,7 @@ define i32 @test(i32* %X) {
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test
 ; IS__CGSCC_OPM-SAME: (i32* nocapture nofree nonnull readnone align 4 dereferenceable(4) [[X:%.*]])
 ; IS__CGSCC_OPM-NEXT:  entry:
-; IS__CGSCC_OPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__CGSCC_OPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; IS__CGSCC_OPM-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; IS__CGSCC_OPM-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; IS__CGSCC_OPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
@@ -69,7 +69,7 @@ define i32 @test(i32* %X) {
 ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@test
 ; IS__CGSCC_NPM-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[X:%.*]])
 ; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__CGSCC_NPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8
 ; IS__CGSCC_NPM-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; IS__CGSCC_NPM-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1

diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll
index eb6f666012da..194188cdf8ae 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll
@@ -18,33 +18,19 @@ define internal i32 @f(%struct.ss* byval  %b) nounwind  {
 ; IS________OPM-NEXT:    store i32 [[TMP2]], i32* [[TMP]], align 8
 ; IS________OPM-NEXT:    ret i32 [[TMP1]]
 ;
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@f
-; IS__TUNIT_NPM-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]])
-; IS__TUNIT_NPM-NEXT:  entry:
-; IS__TUNIT_NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]]
-; IS__TUNIT_NPM-NEXT:    [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP0]], i32* [[B_PRIV_CAST]]
-; IS__TUNIT_NPM-NEXT:    [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 1
-; IS__TUNIT_NPM-NEXT:    store i64 [[TMP1]], i64* [[B_PRIV_0_1]]
-; IS__TUNIT_NPM-NEXT:    [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 0
-; IS__TUNIT_NPM-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8
-; IS__TUNIT_NPM-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP2]], i32* [[TMP]], align 8
-; IS__TUNIT_NPM-NEXT:    ret i32 [[TMP1]]
-;
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@f
-; IS__CGSCC_NPM-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]])
-; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]]
-; IS__CGSCC_NPM-NEXT:    [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
-; IS__CGSCC_NPM-NEXT:    store i32 [[TMP0]], i32* [[B_PRIV_CAST]], align 4
-; IS__CGSCC_NPM-NEXT:    [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 1
-; IS__CGSCC_NPM-NEXT:    store i64 [[TMP1]], i64* [[B_PRIV_0_1]], align 4
-; IS__CGSCC_NPM-NEXT:    [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 0
-; IS__CGSCC_NPM-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8
-; IS__CGSCC_NPM-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
-; IS__CGSCC_NPM-NEXT:    store i32 [[TMP2]], i32* [[TMP]], align 8
-; IS__CGSCC_NPM-NEXT:    ret i32 [[TMP1]]
+; IS________NPM-LABEL: define {{[^@]+}}@f
+; IS________NPM-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]])
+; IS________NPM-NEXT:  entry:
+; IS________NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
+; IS________NPM-NEXT:    [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
+; IS________NPM-NEXT:    store i32 [[TMP0]], i32* [[B_PRIV_CAST]], align 4
+; IS________NPM-NEXT:    [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 1
+; IS________NPM-NEXT:    store i64 [[TMP1]], i64* [[B_PRIV_0_1]], align 4
+; IS________NPM-NEXT:    [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 0
+; IS________NPM-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8
+; IS________NPM-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
+; IS________NPM-NEXT:    store i32 [[TMP2]], i32* [[TMP]], align 8
+; IS________NPM-NEXT:    ret i32 [[TMP1]]
 ;
 entry:
   %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
@@ -65,33 +51,19 @@ define internal i32 @g(%struct.ss* byval align 32 %b) nounwind {
 ; IS________OPM-NEXT:    store i32 [[TMP2]], i32* [[TMP]], align 32
 ; IS________OPM-NEXT:    ret i32 [[TMP2]]
 ;
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@g
-; IS__TUNIT_NPM-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]])
-; IS__TUNIT_NPM-NEXT:  entry:
-; IS__TUNIT_NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]]
-; IS__TUNIT_NPM-NEXT:    [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP0]], i32* [[B_PRIV_CAST]]
-; IS__TUNIT_NPM-NEXT:    [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 1
-; IS__TUNIT_NPM-NEXT:    store i64 [[TMP1]], i64* [[B_PRIV_0_1]]
-; IS__TUNIT_NPM-NEXT:    [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 0
-; IS__TUNIT_NPM-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP]], align 32
-; IS__TUNIT_NPM-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP2]], i32* [[TMP]], align 32
-; IS__TUNIT_NPM-NEXT:    ret i32 [[TMP2]]
-;
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@g
-; IS__CGSCC_NPM-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]])
-; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]]
-; IS__CGSCC_NPM-NEXT:    [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
-; IS__CGSCC_NPM-NEXT:    store i32 [[TMP0]], i32* [[B_PRIV_CAST]], align 4
-; IS__CGSCC_NPM-NEXT:    [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 1
-; IS__CGSCC_NPM-NEXT:    store i64 [[TMP1]], i64* [[B_PRIV_0_1]], align 4
-; IS__CGSCC_NPM-NEXT:    [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 0
-; IS__CGSCC_NPM-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP]], align 32
-; IS__CGSCC_NPM-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
-; IS__CGSCC_NPM-NEXT:    store i32 [[TMP2]], i32* [[TMP]], align 32
-; IS__CGSCC_NPM-NEXT:    ret i32 [[TMP2]]
+; IS________NPM-LABEL: define {{[^@]+}}@g
+; IS________NPM-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]])
+; IS________NPM-NEXT:  entry:
+; IS________NPM-NEXT:    [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
+; IS________NPM-NEXT:    [[B_PRIV_CAST:%.*]] = bitcast %struct.ss* [[B_PRIV]] to i32*
+; IS________NPM-NEXT:    store i32 [[TMP0]], i32* [[B_PRIV_CAST]], align 4
+; IS________NPM-NEXT:    [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 1
+; IS________NPM-NEXT:    store i64 [[TMP1]], i64* [[B_PRIV_0_1]], align 4
+; IS________NPM-NEXT:    [[TMP:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[B_PRIV]], i32 0, i32 0
+; IS________NPM-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP]], align 32
+; IS________NPM-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
+; IS________NPM-NEXT:    store i32 [[TMP2]], i32* [[TMP]], align 32
+; IS________NPM-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
   %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
@@ -105,7 +77,7 @@ entry:
 define i32 @main() nounwind  {
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@main()
 ; IS__TUNIT_OPM-NEXT:  entry:
-; IS__TUNIT_OPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__TUNIT_OPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
 ; IS__TUNIT_OPM-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; IS__TUNIT_OPM-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; IS__TUNIT_OPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
@@ -117,7 +89,7 @@ define i32 @main() nounwind  {
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@main()
 ; IS__TUNIT_NPM-NEXT:  entry:
-; IS__TUNIT_NPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__TUNIT_NPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
 ; IS__TUNIT_NPM-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; IS__TUNIT_NPM-NEXT:    store i32 1, i32* [[TMP1]], align 8
 ; IS__TUNIT_NPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
@@ -137,7 +109,7 @@ define i32 @main() nounwind  {
 ;
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@main()
 ; IS__CGSCC_OPM-NEXT:  entry:
-; IS__CGSCC_OPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__CGSCC_OPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
 ; IS__CGSCC_OPM-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; IS__CGSCC_OPM-NEXT:    store i32 1, i32* [[TMP1]], align 32
 ; IS__CGSCC_OPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
@@ -149,7 +121,7 @@ define i32 @main() nounwind  {
 ;
 ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@main()
 ; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]]
+; IS__CGSCC_NPM-NEXT:    [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
 ; IS__CGSCC_NPM-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; IS__CGSCC_NPM-NEXT:    store i32 1, i32* [[TMP1]], align 32
 ; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1

diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll
index 129146dc3179..5c2ed40cf590 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll
@@ -75,12 +75,12 @@ define internal i64 @CaptureAStruct(%struct.Foo* byval %a) {
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@CaptureAStruct
 ; IS__CGSCC____-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]])
 ; IS__CGSCC____-NEXT:  entry:
-; IS__CGSCC____-NEXT:    [[A_PRIV:%.*]] = alloca [[STRUCT_FOO:%.*]]
+; IS__CGSCC____-NEXT:    [[A_PRIV:%.*]] = alloca [[STRUCT_FOO:%.*]], align 8
 ; IS__CGSCC____-NEXT:    [[A_PRIV_CAST:%.*]] = bitcast %struct.Foo* [[A_PRIV]] to i32*
 ; IS__CGSCC____-NEXT:    store i32 [[TMP0]], i32* [[A_PRIV_CAST]], align 4
 ; IS__CGSCC____-NEXT:    [[A_PRIV_0_1:%.*]] = getelementptr [[STRUCT_FOO]], %struct.Foo* [[A_PRIV]], i32 0, i32 1
 ; IS__CGSCC____-NEXT:    store i64 [[TMP1]], i64* [[A_PRIV_0_1]], align 8
-; IS__CGSCC____-NEXT:    [[A_PTR:%.*]] = alloca %struct.Foo*
+; IS__CGSCC____-NEXT:    [[A_PTR:%.*]] = alloca %struct.Foo*, align 8
 ; IS__CGSCC____-NEXT:    br label [[LOOP:%.*]]
 ; IS__CGSCC____:       loop:
 ; IS__CGSCC____-NEXT:    [[PHI:%.*]] = phi %struct.Foo* [ null, [[ENTRY:%.*]] ], [ [[GEP:%.*]], [[LOOP]] ]

diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll
index 3f82fb7f387f..f8625b256236 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll
@@ -42,7 +42,7 @@ entry:
 define i32 @main() {
 ; CHECK-LABEL: define {{[^@]+}}@main()
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[S:%.*]] = alloca inalloca [[STRUCT_SS:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = alloca inalloca [[STRUCT_SS:%.*]], align 4
 ; CHECK-NEXT:    [[F0:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 0
 ; CHECK-NEXT:    [[F1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; CHECK-NEXT:    store i32 1, i32* [[F0]], align 4

diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll
index 436d5af64650..1920e0108f51 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll
@@ -19,17 +19,17 @@ define internal void @bar(%pair* byval %Data) {
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@bar
 ; IS__TUNIT_NPM-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]])
-; IS__TUNIT_NPM-NEXT:    [[DATA_PRIV:%.*]] = alloca [[PAIR:%.*]]
+; IS__TUNIT_NPM-NEXT:    [[DATA_PRIV:%.*]] = alloca [[PAIR:%.*]], align 8
 ; IS__TUNIT_NPM-NEXT:    [[DATA_PRIV_CAST:%.*]] = bitcast %pair* [[DATA_PRIV]] to i32*
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP0]], i32* [[DATA_PRIV_CAST]]
+; IS__TUNIT_NPM-NEXT:    store i32 [[TMP0]], i32* [[DATA_PRIV_CAST]], align 4
 ; IS__TUNIT_NPM-NEXT:    [[DATA_PRIV_0_1:%.*]] = getelementptr [[PAIR]], %pair* [[DATA_PRIV]], i32 0, i32 1
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP1]], i32* [[DATA_PRIV_0_1]]
+; IS__TUNIT_NPM-NEXT:    store i32 [[TMP1]], i32* [[DATA_PRIV_0_1]], align 4
 ; IS__TUNIT_NPM-NEXT:    [[TMP3:%.*]] = call i8* @foo(%pair* nonnull dereferenceable(8) [[DATA_PRIV]])
 ; IS__TUNIT_NPM-NEXT:    ret void
 ;
 ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@bar
 ; IS__CGSCC_NPM-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]])
-; IS__CGSCC_NPM-NEXT:    [[DATA_PRIV:%.*]] = alloca [[PAIR:%.*]]
+; IS__CGSCC_NPM-NEXT:    [[DATA_PRIV:%.*]] = alloca [[PAIR:%.*]], align 8
 ; IS__CGSCC_NPM-NEXT:    [[DATA_PRIV_CAST:%.*]] = bitcast %pair* [[DATA_PRIV]] to i32*
 ; IS__CGSCC_NPM-NEXT:    store i32 [[TMP0]], i32* [[DATA_PRIV_CAST]], align 4
 ; IS__CGSCC_NPM-NEXT:    [[DATA_PRIV_0_1:%.*]] = getelementptr [[PAIR]], %pair* [[DATA_PRIV]], i32 0, i32 1

diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
index 9a680a20d162..e7d9a63f7076 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
@@ -23,7 +23,7 @@ define internal void @vfu1(%struct.MYstr* byval align 4 %u) nounwind {
 ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@vfu1
 ; IS__CGSCC_NPM-SAME: (i8 [[TMP0:%.*]], i32 [[TMP1:%.*]])
 ; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]]
+; IS__CGSCC_NPM-NEXT:    [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]], align 8
 ; IS__CGSCC_NPM-NEXT:    [[U_PRIV_CAST:%.*]] = bitcast %struct.MYstr* [[U_PRIV]] to i8*
 ; IS__CGSCC_NPM-NEXT:    store i8 [[TMP0]], i8* [[U_PRIV_CAST]], align 1
 ; IS__CGSCC_NPM-NEXT:    [[U_PRIV_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 1
@@ -62,11 +62,11 @@ define internal i32 @vfu2(%struct.MYstr* byval align 4 %u) nounwind readonly {
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@vfu2
 ; IS__TUNIT_NPM-SAME: (i8 [[TMP0:%.*]], i32 [[TMP1:%.*]])
 ; IS__TUNIT_NPM-NEXT:  entry:
-; IS__TUNIT_NPM-NEXT:    [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]]
+; IS__TUNIT_NPM-NEXT:    [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]], align 8
 ; IS__TUNIT_NPM-NEXT:    [[U_PRIV_CAST:%.*]] = bitcast %struct.MYstr* [[U_PRIV]] to i8*
-; IS__TUNIT_NPM-NEXT:    store i8 [[TMP0]], i8* [[U_PRIV_CAST]]
+; IS__TUNIT_NPM-NEXT:    store i8 [[TMP0]], i8* [[U_PRIV_CAST]], align 1
 ; IS__TUNIT_NPM-NEXT:    [[U_PRIV_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 1
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP1]], i32* [[U_PRIV_0_1]]
+; IS__TUNIT_NPM-NEXT:    store i32 [[TMP1]], i32* [[U_PRIV_0_1]], align 4
 ; IS__TUNIT_NPM-NEXT:    [[TMP2:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* @mystr, i32 0, i32 1
 ; IS__TUNIT_NPM-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
 ; IS__TUNIT_NPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* @mystr, i32 0, i32 0
@@ -135,41 +135,23 @@ define internal i32 @vfu2_v2(%struct.MYstr* byval align 4 %u) nounwind readonly
 ; IS________OPM-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], [[TMP1]]
 ; IS________OPM-NEXT:    ret i32 [[TMP5]]
 ;
-; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@vfu2_v2
-; IS__TUNIT_NPM-SAME: (i8 [[TMP0:%.*]], i32 [[TMP1:%.*]])
-; IS__TUNIT_NPM-NEXT:  entry:
-; IS__TUNIT_NPM-NEXT:    [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]]
-; IS__TUNIT_NPM-NEXT:    [[U_PRIV_CAST:%.*]] = bitcast %struct.MYstr* [[U_PRIV]] to i8*
-; IS__TUNIT_NPM-NEXT:    store i8 [[TMP0]], i8* [[U_PRIV_CAST]]
-; IS__TUNIT_NPM-NEXT:    [[U_PRIV_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 1
-; IS__TUNIT_NPM-NEXT:    store i32 [[TMP1]], i32* [[U_PRIV_0_1]]
-; IS__TUNIT_NPM-NEXT:    [[Z:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 1
-; IS__TUNIT_NPM-NEXT:    store i32 99, i32* [[Z]], align 4
-; IS__TUNIT_NPM-NEXT:    [[TMP2:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 1
-; IS__TUNIT_NPM-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
-; IS__TUNIT_NPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 0
-; IS__TUNIT_NPM-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 8
-; IS__TUNIT_NPM-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
-; IS__TUNIT_NPM-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP3]]
-; IS__TUNIT_NPM-NEXT:    ret i32 [[TMP7]]
-;
-; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@vfu2_v2
-; IS__CGSCC_NPM-SAME: (i8 [[TMP0:%.*]], i32 [[TMP1:%.*]])
-; IS__CGSCC_NPM-NEXT:  entry:
-; IS__CGSCC_NPM-NEXT:    [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]]
-; IS__CGSCC_NPM-NEXT:    [[U_PRIV_CAST:%.*]] = bitcast %struct.MYstr* [[U_PRIV]] to i8*
-; IS__CGSCC_NPM-NEXT:    store i8 [[TMP0]], i8* [[U_PRIV_CAST]], align 1
-; IS__CGSCC_NPM-NEXT:    [[U_PRIV_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 1
-; IS__CGSCC_NPM-NEXT:    store i32 [[TMP1]], i32* [[U_PRIV_0_1]], align 4
-; IS__CGSCC_NPM-NEXT:    [[Z:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 1
-; IS__CGSCC_NPM-NEXT:    store i32 99, i32* [[Z]], align 4
-; IS__CGSCC_NPM-NEXT:    [[TMP2:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 1
-; IS__CGSCC_NPM-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
-; IS__CGSCC_NPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 0
-; IS__CGSCC_NPM-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 8
-; IS__CGSCC_NPM-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
-; IS__CGSCC_NPM-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP3]]
-; IS__CGSCC_NPM-NEXT:    ret i32 [[TMP7]]
+; IS________NPM-LABEL: define {{[^@]+}}@vfu2_v2
+; IS________NPM-SAME: (i8 [[TMP0:%.*]], i32 [[TMP1:%.*]])
+; IS________NPM-NEXT:  entry:
+; IS________NPM-NEXT:    [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]], align 8
+; IS________NPM-NEXT:    [[U_PRIV_CAST:%.*]] = bitcast %struct.MYstr* [[U_PRIV]] to i8*
+; IS________NPM-NEXT:    store i8 [[TMP0]], i8* [[U_PRIV_CAST]], align 1
+; IS________NPM-NEXT:    [[U_PRIV_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 1
+; IS________NPM-NEXT:    store i32 [[TMP1]], i32* [[U_PRIV_0_1]], align 4
+; IS________NPM-NEXT:    [[Z:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 1
+; IS________NPM-NEXT:    store i32 99, i32* [[Z]], align 4
+; IS________NPM-NEXT:    [[TMP2:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 1
+; IS________NPM-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+; IS________NPM-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_MYSTR]], %struct.MYstr* [[U_PRIV]], i32 0, i32 0
+; IS________NPM-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 8
+; IS________NPM-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; IS________NPM-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP3]]
+; IS________NPM-NEXT:    ret i32 [[TMP7]]
 ;
 entry:
   %z = getelementptr %struct.MYstr, %struct.MYstr* %u, i32 0, i32 1

diff --git a/llvm/test/Transforms/Attributor/value-simplify.ll b/llvm/test/Transforms/Attributor/value-simplify.ll
index 3b92a326849d..915591114c26 100644
--- a/llvm/test/Transforms/Attributor/value-simplify.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify.ll
@@ -333,7 +333,7 @@ define internal void @test_byval(%struct.X* byval %a) {
 ;
 ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@test_byval
 ; IS__CGSCC_NPM-SAME: (i8* nocapture nofree readnone [[TMP0:%.*]])
-; IS__CGSCC_NPM-NEXT:    [[A_PRIV:%.*]] = alloca [[STRUCT_X:%.*]]
+; IS__CGSCC_NPM-NEXT:    [[A_PRIV:%.*]] = alloca [[STRUCT_X:%.*]], align 8
 ; IS__CGSCC_NPM-NEXT:    [[A_PRIV_CAST:%.*]] = bitcast %struct.X* [[A_PRIV]] to i8**
 ; IS__CGSCC_NPM-NEXT:    store i8* [[TMP0]], i8** [[A_PRIV_CAST]], align 8
 ; IS__CGSCC_NPM-NEXT:    [[G0:%.*]] = getelementptr [[STRUCT_X]], %struct.X* [[A_PRIV]], i32 0, i32 0

diff --git a/llvm/test/Transforms/NewGVN/pr33367.ll b/llvm/test/Transforms/NewGVN/pr33367.ll
index 6d24e1c19682..f4d49cd33997 100644
--- a/llvm/test/Transforms/NewGVN/pr33367.ll
+++ b/llvm/test/Transforms/NewGVN/pr33367.ll
@@ -9,7 +9,7 @@ declare i64 @llvm.x86.bmi.bextr.64(i64, i64) #3
 define %MNR_struct @f000316011717_2(%DS_struct* %pDS, [64 x i64]* %pCG) #2 {
 ; CHECK-LABEL: @f000316011717_2(
 ; CHECK-NEXT:  Entry:
-; CHECK-NEXT:    [[RESTART:%.*]] = alloca [[MNR_STRUCT:%.*]]
+; CHECK-NEXT:    [[RESTART:%.*]] = alloca [[MNR_STRUCT:%.*]], align 8
 ; CHECK-NEXT:    [[PCARRY:%.*]] = getelementptr [[DS_STRUCT:%.*]], %DS_struct* [[PDS:%.*]], i32 0, i32 1
 ; CHECK-NEXT:    [[PBRBASE:%.*]] = getelementptr [[DS_STRUCT]], %DS_struct* [[PDS]], i32 0, i32 0
 ; CHECK-NEXT:    [[PBASE:%.*]] = getelementptr [32 x i64*], [32 x i64*]* [[PBRBASE]], i64 0, i64 0

diff --git a/llvm/test/Transforms/SROA/alignment.ll b/llvm/test/Transforms/SROA/alignment.ll
index f411ed06578d..4b43619eba95 100644
--- a/llvm/test/Transforms/SROA/alignment.ll
+++ b/llvm/test/Transforms/SROA/alignment.ll
@@ -133,8 +133,8 @@ define void @test6() {
 ; We should set the alignment on all load and store operations; make sure
 ; we choose an appropriate alignment.
 ; CHECK-LABEL: @test6(
-; CHECK: alloca double{{$}}
-; CHECK: alloca double{{$}}
+; CHECK: alloca double, align 8{{$}}
+; CHECK: alloca double, align 8{{$}}
 ; CHECK: store{{.*}}, align 8
 ; CHECK: load{{.*}}, align 8
 ; CHECK: store{{.*}}, align 8

diff --git a/llvm/test/Transforms/SROA/pointer-offset-size.ll b/llvm/test/Transforms/SROA/pointer-offset-size.ll
index c632c37988b0..caad3c22eb58 100644
--- a/llvm/test/Transforms/SROA/pointer-offset-size.ll
+++ b/llvm/test/Transforms/SROA/pointer-offset-size.ll
@@ -8,10 +8,10 @@ target datalayout = "e-p:64:64:64:32"
 define i16 @test(%struct.test* %ts2.i) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[S_SROA_0:%.*]] = alloca [3 x i8], align 2
+; CHECK-NEXT:    [[S_SROA_0:%.*]] = alloca [3 x i8], align 8
 ; CHECK-NEXT:    [[S_SROA_0_0__SROA_CAST:%.*]] = bitcast %struct.test* [[TS2_I:%.*]] to i8*
 ; CHECK-NEXT:    [[S_SROA_0_0__SROA_IDX:%.*]] = getelementptr inbounds [3 x i8], [3 x i8]* [[S_SROA_0]], i32 0, i32 0
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[S_SROA_0_0__SROA_CAST]], i8* align 2 [[S_SROA_0_0__SROA_IDX]], i32 3, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[S_SROA_0_0__SROA_CAST]], i8* align 8 [[S_SROA_0_0__SROA_IDX]], i32 3, i1 false)
 ; CHECK-NEXT:    [[X1_I_I:%.*]] = getelementptr inbounds [[STRUCT_TEST:%.*]], %struct.test* [[TS2_I]], i32 0, i32 0, i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[X1_I_I]]
 ; CHECK-NEXT:    ret i16 [[TMP0]]

diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 3939917da32e..9a7836cd558c 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -225,7 +225,7 @@ def LLVM_AllocaOp :
     if ($alignment.hasValue()) {
       auto align = $alignment.getValue().getZExtValue();
       if (align != 0)
-        alloca->setAlignment(llvm::MaybeAlign(align));
+        alloca->setAlignment(llvm::Align(align));
     }
     $res = alloca;
   }];
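
(For context, a minimal standalone sketch of the pattern the builder snippet above now emits, not part of the patch: llvm::Align cannot represent "no alignment", so a zero or unspecified value is filtered out by the caller before the conversion. The helper name below is invented for illustration.)

    // Sketch only; assumes the post-patch AllocaInst::setAlignment(Align) API.
    #include <cstdint>
    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/Alignment.h"

    static void setRequestedAlignment(llvm::AllocaInst *Alloca,
                                      uint64_t Requested) {
      // llvm::Align must be a nonzero power of two, so zero
      // ("keep the default") is checked before constructing it.
      if (Requested != 0)
        Alloca->setAlignment(llvm::Align(Requested));
    }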

diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp
index d844f8714051..5a64cc86ea1d 100644
--- a/polly/lib/CodeGen/BlockGenerators.cpp
+++ b/polly/lib/CodeGen/BlockGenerators.cpp
@@ -511,8 +511,9 @@ Value *BlockGenerator::getOrCreateAlloca(const ScopArrayInfo *Array) {
 
   const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
 
-  Addr = new AllocaInst(Ty, DL.getAllocaAddrSpace(),
-                        ScalarBase->getName() + NameExt);
+  Addr =
+      new AllocaInst(Ty, DL.getAllocaAddrSpace(), nullptr,
+                     DL.getPrefTypeAlign(Ty), ScalarBase->getName() + NameExt);
   EntryBB = &Builder.GetInsertBlock()->getParent()->getEntryBlock();
   Addr->insertBefore(&*EntryBB->getFirstInsertionPt());
 

diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp
index 9f7e01b76b1f..a329661feddf 100644
--- a/polly/lib/CodeGen/IslNodeBuilder.cpp
+++ b/polly/lib/CodeGen/IslNodeBuilder.cpp
@@ -1395,8 +1395,8 @@ bool IslNodeBuilder::preloadInvariantEquivClass(
 
   BasicBlock *EntryBB = &Builder.GetInsertBlock()->getParent()->getEntryBlock();
   auto *Alloca = new AllocaInst(AccInstTy, DL.getAllocaAddrSpace(),
-                                AccInst->getName() + ".preload.s2a");
-  Alloca->insertBefore(&*EntryBB->getFirstInsertionPt());
+                                AccInst->getName() + ".preload.s2a",
+                                &*EntryBB->getFirstInsertionPt());
   Builder.CreateStore(PreloadVal, Alloca);
   ValueMapT PreloadedPointer;
   PreloadedPointer[PreloadVal] = AccInst;
@@ -1496,8 +1496,8 @@ void IslNodeBuilder::allocateNewArrays(BBPair StartExitBlocks) {
 
       auto *CreatedArray = new AllocaInst(NewArrayType, DL.getAllocaAddrSpace(),
                                           SAI->getName(), &*InstIt);
-      CreatedArray->setAlignment(
-          MaybeAlign(PollyTargetFirstLevelCacheLineSize));
+      if (PollyTargetFirstLevelCacheLineSize)
+        CreatedArray->setAlignment(Align(PollyTargetFirstLevelCacheLineSize));
       SAI->setBasePtr(CreatedArray);
     }
   }
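
(As a usage reference for the Polly hunks above, here is a self-contained sketch of the constructor form BlockGenerators.cpp now uses, with an explicit array size and an explicit preferred-type alignment. The function name and the "slot" label are placeholders, not names from the patch.)

    // Sketch only; mirrors the updated AllocaInst constructor call.
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instructions.h"

    static llvm::AllocaInst *createScalarSlot(llvm::Type *Ty,
                                              const llvm::DataLayout &DL,
                                              llvm::Instruction *InsertPt) {
      // A null array size means a single element; the alignment passed is the
      // data layout's preferred alignment for the allocated type.
      return new llvm::AllocaInst(Ty, DL.getAllocaAddrSpace(),
                                  /*ArraySize=*/nullptr,
                                  DL.getPrefTypeAlign(Ty), "slot", InsertPt);
    }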


        

