[llvm] 3890a3b - [AMDGPU] Use SSAUpdater in PromoteAlloca

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 24 22:44:54 PDT 2023


Author: pvanhout
Date: 2023-07-25T07:44:47+02:00
New Revision: 3890a3b11398a58811098ff4d3a7ff949fea1143

URL: https://github.com/llvm/llvm-project/commit/3890a3b11398a58811098ff4d3a7ff949fea1143
DIFF: https://github.com/llvm/llvm-project/commit/3890a3b11398a58811098ff4d3a7ff949fea1143.diff

LOG: [AMDGPU] Use SSAUpdater in PromoteAlloca

This allows PromoteAlloca to stop relying on a second SROA run to remove the alloca completely; it now does the full transformation directly.

Note that PromoteAlloca still relies on SROA running first to
canonicalize the IR. For instance, PromoteAlloca no longer handles aggregate types because those should be simplified by SROA before reaching the pass.
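
As a rough sketch (the IR below is a hypothetical minimal example, not taken
from the commit, and value names are only illustrative), the pass now rewrites
element accesses of a promotable alloca into SSA vector values and lets
SSAUpdater insert PHIs at join points. Given:

  entry:
    %stack = alloca [2 x i64], addrspace(5)
    br i1 %cond, label %then, label %else
  then:
    store i64 1, ptr addrspace(5) %stack
    br label %end
  else:
    store i64 2, ptr addrspace(5) %stack
    br label %end
  end:
    %v = load i64, ptr addrspace(5) %stack

the result is roughly:

  entry:
    br i1 %cond, label %then, label %else
  then:
    %0 = insertelement <2 x i64> undef, i64 1, i32 0
    br label %end
  else:
    %1 = insertelement <2 x i64> undef, i64 2, i32 0
    br label %end
  end:
    %promotealloca = phi <2 x i64> [ %0, %then ], [ %1, %else ]
    %v = extractelement <2 x i64> %promotealloca, i32 0

The alloca itself is erased, so no follow-up SROA run is needed to clean it up.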

Reviewed By: #amdgpu, arsenm

Differential Revision: https://reviews.llvm.org/D152706

Added: 
    llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
    llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
    llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 0b57e1387a2adb..a1e092b19b5030 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -28,7 +28,10 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
@@ -38,6 +41,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 
 #define DEBUG_TYPE "amdgpu-promote-alloca"
 
@@ -45,20 +49,20 @@ using namespace llvm;
 
 namespace {
 
-static cl::opt<bool> DisablePromoteAllocaToVector(
-  "disable-promote-alloca-to-vector",
-  cl::desc("Disable promote alloca to vector"),
-  cl::init(false));
+static cl::opt<bool>
+    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
+                                 cl::desc("Disable promote alloca to vector"),
+                                 cl::init(false));
 
-static cl::opt<bool> DisablePromoteAllocaToLDS(
-  "disable-promote-alloca-to-lds",
-  cl::desc("Disable promote alloca to LDS"),
-  cl::init(false));
+static cl::opt<bool>
+    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
+                              cl::desc("Disable promote alloca to LDS"),
+                              cl::init(false));
 
 static cl::opt<unsigned> PromoteAllocaToVectorLimit(
-  "amdgpu-promote-alloca-to-vector-limit",
-  cl::desc("Maximum byte size to consider promote alloca to vector"),
-  cl::init(0));
+    "amdgpu-promote-alloca-to-vector-limit",
+    cl::desc("Maximum byte size to consider promote alloca to vector"),
+    cl::init(0));
 
 // Shared implementation which can do both promotion to vector and to LDS.
 class AMDGPUPromoteAllocaImpl {
@@ -80,17 +84,16 @@ class AMDGPUPromoteAllocaImpl {
 
   /// BaseAlloca is the alloca root the search started from.
   /// Val may be that alloca or a recursive user of it.
-  bool collectUsesWithPtrTypes(Value *BaseAlloca,
-                               Value *Val,
-                               std::vector<Value*> &WorkList) const;
+  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
+                               std::vector<Value *> &WorkList) const;
 
   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
   /// Returns true if both operands are derived from the same alloca. Val should
   /// be the same value as one of the input operands of UseInst.
   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
-                                       Instruction *UseInst,
-                                       int OpIdx0, int OpIdx1) const;
+                                       Instruction *UseInst, int OpIdx0,
+                                       int OpIdx1) const;
 
   /// Check whether we have enough local memory for promotion.
   bool hasSufficientLocalMem(const Function &F);
@@ -253,6 +256,10 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
       Changed = true;
   }
 
+  // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
+  // dangling pointers. If we want to reuse it past this point, the loop above
+  // would need to be updated to remove successfully promoted allocas.
+
   return Changed;
 }
 
@@ -269,6 +276,10 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
   using namespace PatternMatch;
   // For now we only care about non-volatile memsets that affect the whole type
   // (start at index 0 and fill the whole alloca).
+  //
+  // TODO: Now that we moved to PromoteAlloca we could handle any memsets
+  // (except maybe volatile ones?) - we just need to use shufflevector if it
+  // only affects a subset of the vector.
   const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
   return I->getOperand(0) == AI &&
          match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
@@ -319,6 +330,200 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   return ConstantInt::get(GEP->getContext(), Quot);
 }
 
+/// Promotes a single user of the alloca to a vector form.
+///
+/// \param Inst           Instruction to be promoted.
+/// \param DL             Module Data Layout.
+/// \param VectorTy       Vectorized Type.
+/// \param VecStoreSize   Size of \p VectorTy in bytes.
+/// \param ElementSize    Size of \p VectorTy element type in bytes.
+/// \param TransferInfo   MemTransferInst info map.
+/// \param GEPVectorIdx   GEP -> VectorIdx cache.
+/// \param CurVal         Current value of the vector (e.g. last stored value)
+/// \param[out]  DeferredLoads \p Inst is added to this vector if it can't
+///              be promoted now. This happens when promoting requires \p
+///              CurVal, but \p CurVal is nullptr.
+/// \return the stored value if \p Inst would have written to the alloca, or
+///         nullptr otherwise.
+static Value *promoteAllocaUserToVector(
+    Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
+    unsigned VecStoreSize, unsigned ElementSize,
+    DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
+    std::map<GetElementPtrInst *, Value *> &GEPVectorIdx, Value *CurVal,
+    SmallVectorImpl<LoadInst *> &DeferredLoads) {
+  // Note: we use InstSimplifyFolder because it can leverage the DataLayout
+  // to do more folding, especially in the case of vector splats.
+  IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
+                                        InstSimplifyFolder(DL));
+  Builder.SetInsertPoint(Inst);
+
+  const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
+    if (CurVal)
+      return CurVal;
+
+    // If the current value is not known, insert a dummy load and lower it on
+    // the second pass.
+    LoadInst *Dummy =
+        Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
+                           "promotealloca.dummyload");
+    DeferredLoads.push_back(Dummy);
+    return Dummy;
+  };
+
+  const auto CreateTempPtrIntCast =
+      [&Builder, VecStoreSize](Value *Val, Type *PtrTy) -> Value * {
+    const unsigned TempIntSize = (VecStoreSize * 8);
+    if (!PtrTy->isVectorTy())
+      return Builder.CreateBitOrPointerCast(Val,
+                                            Builder.getIntNTy(TempIntSize));
+    const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
+    // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to
+    // first cast the ptr vector to <2 x i64>.
+    assert(alignTo(TempIntSize, NumPtrElts) == TempIntSize &&
+           "Vector size not divisble");
+    Type *EltTy = Builder.getIntNTy(TempIntSize / NumPtrElts);
+    return Builder.CreateBitOrPointerCast(
+        Val, FixedVectorType::get(EltTy, NumPtrElts));
+  };
+
+  Type *VecEltTy = VectorTy->getElementType();
+  switch (Inst->getOpcode()) {
+  case Instruction::Load: {
+    // Loads can only be lowered if the value is known.
+    if (!CurVal) {
+      DeferredLoads.push_back(cast<LoadInst>(Inst));
+      return nullptr;
+    }
+
+    Value *Index = calculateVectorIndex(
+        cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
+
+    // We're loading the full vector.
+    if (DL.getTypeStoreSize(Inst->getType()) == VecStoreSize) {
+      assert(cast<Constant>(Index)->isZeroValue());
+      Type *InstTy = Inst->getType();
+      if (InstTy->isPtrOrPtrVectorTy())
+        CurVal = CreateTempPtrIntCast(CurVal, InstTy);
+      Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, InstTy);
+      Inst->replaceAllUsesWith(NewVal);
+      return nullptr;
+    }
+
+    // We're loading one element.
+    Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
+    if (Inst->getType() != VecEltTy)
+      ExtractElement =
+          Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
+
+    Inst->replaceAllUsesWith(ExtractElement);
+    return nullptr;
+  }
+  case Instruction::Store: {
+    // For stores, it's a bit trickier and it depends on whether we're storing
+    // the full vector or not. If we're storing the full vector, we don't need
+    // to know the current value. If this is a store of a single element, we
+    // need to know the value.
+    StoreInst *SI = cast<StoreInst>(Inst);
+    Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
+    Value *Val = SI->getValueOperand();
+
+    // We're storing the full vector, we can handle this without knowing CurVal.
+    if (DL.getTypeStoreSize(Val->getType()) == VecStoreSize) {
+      assert(cast<Constant>(Index)->isZeroValue());
+      Type *SrcTy = Val->getType();
+      if (SrcTy->isPtrOrPtrVectorTy())
+        Val = CreateTempPtrIntCast(Val, SrcTy);
+      return Builder.CreateBitOrPointerCast(Val, VectorTy);
+    }
+
+    if (Val->getType() != VecEltTy)
+      Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
+    return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
+                                       Index);
+  }
+  case Instruction::Call: {
+    if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
+      // For memcpy, we need to know CurVal.
+      ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
+      unsigned NumCopied = Length->getZExtValue() / ElementSize;
+      MemTransferInfo *TI = &TransferInfo[MTI];
+      unsigned SrcBegin = TI->SrcIndex->getZExtValue();
+      unsigned DestBegin = TI->DestIndex->getZExtValue();
+
+      SmallVector<int> Mask;
+      for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
+        if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
+          Mask.push_back(SrcBegin++);
+        } else {
+          Mask.push_back(Idx);
+        }
+      }
+
+      return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
+    }
+
+    if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
+      // For memset, we don't need to know the previous value because we
+      // currently only allow memsets that cover the whole alloca.
+      Value *Elt = MSI->getOperand(1);
+      if (DL.getTypeStoreSize(VecEltTy) > 1) {
+        Value *EltBytes =
+            Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt);
+        Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
+      }
+
+      return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
+    }
+
+    llvm_unreachable("Unsupported call when promoting alloca to vector");
+  }
+
+  default:
+    llvm_unreachable("Inconsistency in instructions promotable to vector");
+  }
+
+  llvm_unreachable("Did not return after promoting instruction!");
+}
+
+/// Iterates over an instruction worklist that may contain multiple instructions
+/// from the same basic block, but in a different order.
+template <typename InstContainer>
+static void forEachWorkListItem(const InstContainer &WorkList,
+                                std::function<void(Instruction *)> Fn) {
+  // Bucket up uses of the alloca by the block they occur in.
+  // This is important because we have to handle multiple defs/uses in a block
+  // ourselves: SSAUpdater is purely for cross-block references.
+  DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
+  for (Instruction *User : WorkList)
+    UsesByBlock[User->getParent()].insert(User);
+
+  for (Instruction *User : WorkList) {
+    BasicBlock *BB = User->getParent();
+    auto &BlockUses = UsesByBlock[BB];
+
+    // Already processed, skip.
+    if (BlockUses.empty())
+      continue;
+
+    // Only user in the block, directly process it.
+    if (BlockUses.size() == 1) {
+      Fn(User);
+      continue;
+    }
+
+    // Multiple users in the block, do a linear scan to see users in order.
+    for (Instruction &Inst : *BB) {
+      if (!BlockUses.contains(&Inst))
+        continue;
+
+      Fn(&Inst);
+    }
+
+    // Clear the block so we know it's been processed.
+    BlockUses.clear();
+  }
+}
+
 // FIXME: Should try to pick the most likely to be profitable allocas first.
 bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -365,6 +570,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
   std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
   SmallVector<Instruction *> WorkList;
+  SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;
   SmallVector<Use *, 8> Uses;
   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
@@ -393,12 +599,18 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
         return RejectUser(Inst, "pointer is being stored");
 
       Type *AccessTy = getLoadStoreType(Inst);
+      if (AccessTy->isAggregateType())
+        return RejectUser(Inst, "unsupported load/store as aggregate");
+      assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
+
       Ptr = Ptr->stripPointerCasts();
 
-      // Alloca already accessed as vector, leave alone.
+      // Alloca already accessed as vector.
       if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
-                                DL->getTypeStoreSize(AccessTy))
+                                DL->getTypeStoreSize(AccessTy)) {
+        WorkList.push_back(Inst);
         continue;
+      }
 
       // Check that this is a simple access of a vector element.
       bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
@@ -416,6 +628,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       // Look through bitcasts.
       for (Use &U : Inst->uses())
         Uses.push_back(&U);
+      UsersToRemove.push_back(Inst);
       continue;
     }
 
@@ -429,6 +642,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       GEPVectorIdx[GEP] = Index;
       for (Use &U : Inst->uses())
         Uses.push_back(&U);
+      UsersToRemove.push_back(Inst);
       continue;
     }
 
@@ -481,13 +695,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     }
 
     // Ignore assume-like intrinsics and comparisons used in assumes.
-    if (isAssumeLikeIntrinsic(Inst))
+    if (isAssumeLikeIntrinsic(Inst)) {
+      UsersToRemove.push_back(Inst);
       continue;
+    }
 
     if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
           return isAssumeLikeIntrinsic(cast<Instruction>(U));
-        }))
+        })) {
+      UsersToRemove.push_back(Inst);
       continue;
+    }
 
     return RejectUser(Inst, "unhandled alloca user");
   }
@@ -505,75 +723,60 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
+  const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
+
+  // Alloca is uninitialized memory. Imitate that by making the first value
+  // undef.
+  SSAUpdater Updater;
+  Updater.Initialize(VectorTy, "promotealloca");
+  Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy));
+
+  // First handle the initial worklist.
+  SmallVector<LoadInst *, 4> DeferredLoads;
+  forEachWorkListItem(WorkList, [&](Instruction *I) {
+    BasicBlock *BB = I->getParent();
+    // On the first pass, we only take values that are trivially known, i.e.
+    // where AddAvailableValue was already called in this block.
+    Value *Result = promoteAllocaUserToVector(
+        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
+        Updater.FindValueForBlock(BB), DeferredLoads);
+    if (Result)
+      Updater.AddAvailableValue(BB, Result);
+  });
+
+  // Then handle deferred loads.
+  forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
+    SmallVector<LoadInst *, 0> NewDLs;
+    BasicBlock *BB = I->getParent();
+    // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
+    // get a value, inserting PHIs as needed.
+    Value *Result = promoteAllocaUserToVector(
+        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
+        Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
+    if (Result)
+      Updater.AddAvailableValue(BB, Result);
+    assert(NewDLs.empty() && "No more deferred loads should be queued!");
+  });
+
+  // Delete all instructions. On the first pass, new dummy loads may have been
+  // added so we need to collect them too.
+  DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
+  InstsToDelete.insert(DeferredLoads.begin(), DeferredLoads.end());
+  for (Instruction *I : InstsToDelete) {
+    assert(I->use_empty());
+    I->eraseFromParent();
+  }
 
-  for (Instruction *Inst : WorkList) {
-    IRBuilder<> Builder(Inst);
-    switch (Inst->getOpcode()) {
-    case Instruction::Load: {
-      Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
-      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      Value *VecValue =
-          Builder.CreateAlignedLoad(VectorTy, &Alloca, Alloca.getAlign());
-      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
-      if (Inst->getType() != VecEltTy)
-        ExtractElement =
-            Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
-      Inst->replaceAllUsesWith(ExtractElement);
-      Inst->eraseFromParent();
-      break;
-    }
-    case Instruction::Store: {
-      StoreInst *SI = cast<StoreInst>(Inst);
-      Value *Ptr = SI->getPointerOperand();
-      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      Value *VecValue =
-          Builder.CreateAlignedLoad(VectorTy, &Alloca, Alloca.getAlign());
-      Value *Elt = SI->getValueOperand();
-      if (Elt->getType() != VecEltTy)
-        Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
-      Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
-      Builder.CreateAlignedStore(NewVecValue, &Alloca, Alloca.getAlign());
-      Inst->eraseFromParent();
-      break;
-    }
-    case Instruction::Call: {
-      if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
-        ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
-        unsigned NumCopied = Length->getZExtValue() / ElementSize;
-        MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)];
-        unsigned SrcBegin = TI->SrcIndex->getZExtValue();
-        unsigned DestBegin = TI->DestIndex->getZExtValue();
-
-        SmallVector<int> Mask;
-        for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
-          if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
-            Mask.push_back(SrcBegin++);
-          } else {
-            Mask.push_back(Idx);
-          }
-        }
-        Value *VecValue =
-            Builder.CreateAlignedLoad(VectorTy, &Alloca, Alloca.getAlign());
-        Value *NewVecValue = Builder.CreateShuffleVector(VecValue, Mask);
-        Builder.CreateAlignedStore(NewVecValue, &Alloca, Alloca.getAlign());
-
-        Inst->eraseFromParent();
-      } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
-        // Ensure the length parameter of the memsets matches the new vector
-        // type's. In general, the type size shouldn't change so this is a
-        // no-op, but it's better to be safe.
-        MSI->setOperand(2, Builder.getInt64(DL->getTypeStoreSize(VectorTy)));
-      } else {
-        llvm_unreachable("Unsupported call when promoting alloca to vector");
-      }
-      break;
-    }
-
-    default:
-      llvm_unreachable("Inconsistency in instructions promotable to vector");
-    }
+  // Delete all the users that are known to be removable.
+  for (Instruction *I : reverse(UsersToRemove)) {
+    I->dropDroppableUses();
+    assert(I->use_empty());
+    I->eraseFromParent();
   }
 
+  // Alloca should now be dead too.
+  assert(Alloca.use_empty());
+  Alloca.eraseFromParent();
   return true;
 }
 
@@ -1061,7 +1264,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
 
   CurrentLocalMemUsage = NewSize;
 
-  std::vector<Value*> WorkList;
+  std::vector<Value *> WorkList;
 
   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
     LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
@@ -1204,10 +1407,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
     assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
 
     MemTransferInst *MI = cast<MemTransferInst>(Intr);
-    auto *B =
-      Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
-                                    MI->getRawSource(), MI->getSourceAlign(),
-                                    MI->getLength(), MI->isVolatile());
+    auto *B = Builder.CreateMemTransferInst(
+        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
+        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
 
     for (unsigned I = 0; I != 2; ++I) {
       if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {

diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index 8e0750195b3b4d..f440b1f9ee6f7b 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -13,37 +13,20 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
-; GCN-NEXT:    s_add_i32 s33, s32, 0xfc0
-; GCN-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    s_addk_i32 s32, 0x3000
+; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v42, s16, 2
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
; GCN-NEXT:    s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
; GCN-NEXT:    s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_writelane_b32 v42, s30, 0
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:92
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:88
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:84
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:80
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:76
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:72
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:68
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_writelane_b32 v42, s31, 1
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
@@ -57,9 +40,9 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN-NEXT:    v_readlane_b32 s30, v42, 0
 ; GCN-NEXT:    v_readlane_b32 s4, v42, 2
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    s_addk_i32 s32, 0xd000
+; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
index 785ac5ad51a048..68737cb227a00e 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca < %s | FileCheck %s
 
 ; Make sure that array alloca loaded and stored as multi-element aggregates are handled correctly
 ; Strictly the promote-alloca pass shouldn't have to deal with this case as it is non-canonical, but
@@ -16,19 +16,16 @@
 
 define amdgpu_vs void @promote_1d_aggr() #0 {
 ; CHECK-LABEL: @promote_1d_aggr(
-; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
-; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4
-; CHECK-NEXT:    store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [1 x float] [[FOO3]], 0
+; CHECK-NEXT:    [[FOO3_FCA_0_GEP:%.*]] = getelementptr inbounds [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
+; CHECK-NEXT:    store float [[FOO3_FCA_0_EXTRACT]], ptr addrspace(5) [[FOO3_FCA_0_GEP]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO1]]
 ; CHECK-NEXT:    [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4
-; CHECK-NEXT:    [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
-; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
-; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0
+; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> undef, float [[FOO6]], i32 0
 ; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
 ; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
 ; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
@@ -60,22 +57,12 @@ define amdgpu_vs void @promote_1d_aggr() #0 {
 
 define amdgpu_vs void @promote_store_aggr() #0 {
 ; CHECK-LABEL: @promote_store_aggr(
-; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT:    [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4
-; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0
-; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1
-; CHECK-NEXT:    store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO3:%.*]] = sitofp i32 [[FOO1]] to float
+; CHECK-NEXT:    [[FOO6_FCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[FOO3]], 0
+; CHECK-NEXT:    [[FOO6_FCA_1_INSERT:%.*]] = insertvalue [2 x float] [[FOO6_FCA_0_INSERT]], float 2.000000e+00, 1
 ; CHECK-NEXT:    [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1
-; CHECK-NEXT:    store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4
+; CHECK-NEXT:    store [2 x float] [[FOO6_FCA_1_INSERT]], ptr addrspace(1) [[FOO7]], align 4
 ; CHECK-NEXT:    store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv, align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -100,23 +87,18 @@ define amdgpu_vs void @promote_store_aggr() #0 {
 
 define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 ; CHECK-LABEL: @promote_load_from_store_aggr(
-; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT:    [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
-; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
-; CHECK-NEXT:    store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]]
-; CHECK-NEXT:    [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
-; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
-; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0
-; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2
-; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3
+; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0
+; CHECK-NEXT:    [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
+; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
 ; CHECK-NEXT:    store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -142,22 +124,7 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 
 define amdgpu_vs void @promote_memmove_aggr() #0 {
 ; CHECK-LABEL: @promote_memmove_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4>
-; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
-; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -174,24 +141,12 @@ define amdgpu_vs void @promote_memmove_aggr() #0 {
 
 define amdgpu_vs void @promote_memcpy_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 2.000000e+00, i64 3
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 3.000000e+00, i32 [[FOO4]]
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
-; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -213,22 +168,7 @@ define amdgpu_vs void @promote_memcpy_aggr() #0 {
 
 define amdgpu_vs void @promote_memcpy_identity_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_identity_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
-; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -248,8 +188,26 @@ define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
 ; CHECK-LABEL: @promote_memcpy_two_aggrs(
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F2]], align 4
+; CHECK-NEXT:    [[DOTFCA_0_GEP1:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP1]], align 4
+; CHECK-NEXT:    [[DOTFCA_1_GEP2:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP2]], align 4
+; CHECK-NEXT:    [[DOTFCA_2_GEP3:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP3]], align 4
+; CHECK-NEXT:    [[DOTFCA_3_GEP4:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP4]], align 4
+; CHECK-NEXT:    [[DOTFCA_4_GEP5:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP5]], align 4
+; CHECK-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 0
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 1
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 2
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 3
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
 ; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
@@ -283,7 +241,16 @@ define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
 define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 {
 ; CHECK-LABEL: @promote_memcpy_p1p5_aggr(
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
 ; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
@@ -305,21 +272,12 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0
 
 define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_inline_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 3.000000e+00, i32 [[FOO4]]
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <5 x float> [[TMP5]], i32 0
-; CHECK-NEXT:    store float [[TMP6]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> zeroinitializer, float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -347,30 +305,16 @@ declare void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, p
 
 define amdgpu_ps void @promote_double_aggr() #0 {
 ; CHECK-LABEL: @promote_double_aggr(
-; CHECK-NEXT:    [[S:%.*]] = alloca [2 x double], align 8, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8
 ; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8
 ; CHECK-NEXT:    [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0
 ; CHECK-NEXT:    [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
-; CHECK-NEXT:    store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
-; CHECK-NEXT:    [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1
-; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0
-; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
-; CHECK-NEXT:    [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1
-; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[FOO5_FCA_0_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 0
+; CHECK-NEXT:    [[FOO5_FCA_1_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 1
+; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]]
+; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]]
 ; CHECK-NEXT:    [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
 ; CHECK-NEXT:    [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
 ; CHECK-NEXT:    [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
@@ -410,21 +354,6 @@ define amdgpu_ps void @promote_double_aggr() #0 {
 define amdgpu_kernel void @alloca_struct() #0 {
 ; CHECK-LABEL: @alloca_struct(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]]
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
index 3596c96b8cd79b..4cec3bd41ce2f4 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -6,7 +6,7 @@
 @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 
 ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
-; IR: alloca [10 x i32]
+; IR-NOT: alloca [10 x i32]
 ; ASM-LABEL: {{^}}promote_alloca_size_256:
 ; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 16
 ; ASM-NOT: .amdgpu_lds

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
new file mode 100644
index 00000000000000..0eeec70d1995b1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+define amdgpu_kernel void @test_overwrite(i64 %val, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @test_overwrite
+; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 68, i32 0
+; CHECK-NEXT:    [[TMP2]] = insertelement <3 x i64> [[TMP1]], i64 32, i32 0
+; CHECK-NEXT:    [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 68
+; CHECK-NEXT:    br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [3 x i64], align 4, addrspace(5)
+  store i64 43, ptr addrspace(5) %stack
+  br i1 %cond, label %loop, label %end
+
+loop:
+  %load.0 = load i64, ptr addrspace(5) %stack
+  store i64 68, ptr addrspace(5) %stack
+  %load.1 = load i64, ptr addrspace(5) %stack
+  store i64 32, ptr addrspace(5) %stack
+  %loop.cc = icmp ne i64 %load.0, %load.1
+  br i1 %loop.cc, label %loop, label %end
+
+end:
+  %reload = load i64, ptr addrspace(5) %stack
+  ret void
+}
+
+define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @test_no_overwrite
+; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP1:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
+; CHECK-NEXT:    [[TMP1]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 32, i32 1
+; CHECK-NEXT:    [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 32
+; CHECK-NEXT:    br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP1]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [3 x i64], align 4, addrspace(5)
+  %stack.1 = getelementptr inbounds i64, ptr addrspace(5) %stack, i32 1
+  store i64 43, ptr addrspace(5) %stack
+  br i1 %cond, label %loop, label %end
+
+loop:
+  %load = load i64, ptr addrspace(5) %stack
+  store i64 32, ptr addrspace(5) %stack.1
+  %loop.cc = icmp ne i64 %load, 32
+  br i1 %loop.cc, label %loop, label %end
+
+end:
+  %reload = load i64, ptr addrspace(5) %stack
+  %reload.1 = load i64, ptr addrspace(5) %stack.1
+  ret void
+}
+
+define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
+; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
+; CHECK-SAME: (ptr [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT:    ret ptr [[TMP3]]
+;
+entry:
+  %alloca = alloca [8 x i8], align 8, addrspace(5)
+  store ptr %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load ptr, ptr addrspace(5) %alloca, align 8
+  ret ptr %tmp
+}
+
+define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg) {
+; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec
+; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(3) [[ARG]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[TMP0]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i8> [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT:    ret ptr addrspace(3) [[TMP3]]
+;
+entry:
+  %alloca = alloca [4 x i8], align 8, addrspace(5)
+  store ptr addrspace(3) %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8
+  ret ptr addrspace(3) %tmp
+}
+
+define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) {
+; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
+; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)>
+; CHECK-NEXT:    ret <4 x ptr addrspace(3)> [[TMP2]]
+;
+entry:
+  %alloca = alloca [4 x i32], align 8, addrspace(5)
+  store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
+  ret <4 x ptr addrspace(3)> %tmp
+}
+
+; Currently rejected because the store is not castable.
+; TODO: We should probably be able to vectorize this.
+define void @alloca_load_store_ptr_mixed_ptrvec(<2 x ptr addrspace(3)> %arg) {
+; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_ptrvec
+; CHECK-SAME: (<2 x ptr addrspace(3)> [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [8 x i32], align 8, addrspace(5)
+; CHECK-NEXT:    store <2 x ptr addrspace(3)> [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    [[TMP:%.*]] = load <2 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    [[TMP_FULL:%.*]] = load <4 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca [8 x i32], align 8, addrspace(5)
+  store <2 x ptr addrspace(3)> %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load <2 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
+  %tmp.full = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
+  ret void
+}
+
+; Will not vectorize because we're accessing a 64-bit vector with a 32-bit pointer.
+define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec(ptr addrspace(3) %arg) {
+; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec
+; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [8 x i8], align 8, addrspace(5)
+; CHECK-NEXT:    store ptr addrspace(3) [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    [[TMP:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    ret ptr addrspace(3) [[TMP]]
+;
+entry:
+  %alloca = alloca [8 x i8], align 8, addrspace(5)
+  store ptr addrspace(3) %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8
+  ret ptr addrspace(3) %tmp
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
index f31421de517cb9..a99c01edcc12d3 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -1,19 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca,sroa < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
 
 ; Checks that memsets don't block PromoteAlloca.
 
-; Note: memsets are just updated with the new type size. They are not eliminated which means
-; the original alloca also stay. This puts a bit more load on SROA.
-; If PromoteAlloca is moved to SSAUpdater, we could just entirely replace the memsets with
-; e.g. ConstantAggregate.
-
 define amdgpu_kernel void @memset_all_zero(i64 %val) {
 ; CHECK-LABEL: @memset_all_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <6 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -30,8 +24,7 @@ define amdgpu_kernel void @memset_all_5(i64 %val) {
 ; CHECK-LABEL: @memset_all_5(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> <i64 361700864190383365, i64 361700864190383365, i64 361700864190383365, i64 361700864190383365>, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -47,11 +40,9 @@ entry:
 define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_volatile_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
-; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [3 x i64], align 8, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_2]], i8 0, i64 24, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -64,11 +55,9 @@ entry:
 define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_badsize_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
-; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [23 x i8], align 4, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[STACK_SROA_2]], i8 0, i64 23, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -81,8 +70,10 @@ entry:
 define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_offset_ptr_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK_SROA_1:%.*]] = alloca [3 x i64], align 8, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_1]], i8 0, i64 24, i1 true)
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [4 x i64], ptr addrspace(5) [[STACK]], i64 0, i64 1
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[GEP]], i8 0, i64 24, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
index 70b4e94f36c07c..8df15e3f7e29a9 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
@@ -4,15 +4,10 @@
 define i64 @test_pointer_array(i64 %v) {
 ; OPT-LABEL: @test_pointer_array(
 ; OPT-NEXT:  entry:
-; OPT-NEXT:    [[A:%.*]] = alloca [3 x ptr], align 16, addrspace(5)
-; OPT-NEXT:    [[TMP0:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
-; OPT-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[V:%.*]] to ptr
-; OPT-NEXT:    [[TMP2:%.*]] = insertelement <3 x ptr> [[TMP0]], ptr [[TMP1]], i32 0
-; OPT-NEXT:    store <3 x ptr> [[TMP2]], ptr addrspace(5) [[A]], align 16
-; OPT-NEXT:    [[TMP3:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
-; OPT-NEXT:    [[TMP4:%.*]] = extractelement <3 x ptr> [[TMP3]], i32 0
-; OPT-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
-; OPT-NEXT:    ret i64 [[TMP5]]
+; OPT-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[V:%.*]] to ptr
+; OPT-NEXT:    [[TMP1:%.*]] = insertelement <3 x ptr> undef, ptr [[TMP0]], i32 0
+; OPT-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; OPT-NEXT:    ret i64 [[TMP2]]
 ;
 entry:
   %a = alloca [3 x ptr], align 16, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index adabeab3795059..083ed999ac3712 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
 
 ; GCN-LABEL: {{^}}float4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
@@ -11,11 +11,8 @@
 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
 ; GCN: store_dword v{{.+}}, [[RES]]
 
-; OPT:  %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT:  store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, ptr addrspace(5) %alloca, align 4
-; OPT:  %0 = load <4 x float>, ptr addrspace(5) %alloca
-; OPT:  %1 = extractelement <4 x float> %0, i32 %sel2
-; OPT:  store float %1, ptr addrspace(1) %out, align 4
+; OPT:  %0 = extractelement <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, i32 %sel2
+; OPT:  store float %0, ptr addrspace(1) %out, align 4
 
 define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -46,12 +43,8 @@ entry:
 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
 ; GCN:     store_dwordx4 v{{.+}},
 
-; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
-; OPT: store <4 x float> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
-; OPT:  store <4 x float> %load, ptr addrspace(1) %out, align 4
+; OPT: %0 = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel2
+; OPT: store <4 x float> %0, ptr addrspace(1) %out, align 4
 
 define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -77,11 +70,8 @@ entry:
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
 
-; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, ptr addrspace(5) %alloca, align 2
-; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
-; OPT: store half %1, ptr addrspace(1) %out, align 2
+; OPT: %0 = extractelement <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, i32 %sel2
+; OPT: store half %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -105,12 +95,8 @@ entry:
 ; GCN-NOT: buffer_
 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff
 
-; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
-; OPT: store <4 x half> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
-; OPT: store <4 x half> %load, ptr addrspace(1) %out, align 2
+; OPT: %0 = insertelement <4 x half> undef, half 0xH3C00, i32 %sel2
+; OPT: store <4 x half> %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -136,11 +122,8 @@ entry:
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
 
-; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, ptr addrspace(5) %alloca, align 2
-; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
-; OPT: store i16 %1, ptr addrspace(1) %out, align 2
+; OPT: %0 = extractelement <4 x i16> <i16 1, i16 2, i16 3, i16 4>, i32 %sel2
+; OPT: store i16 %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -164,12 +147,8 @@ entry:
 ; GCN-NOT: buffer_
 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff
 
-; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
-; OPT: store <4 x i16> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
-; OPT: store <4 x i16> %load, ptr addrspace(1) %out, align 2
+; OPT: %0 = insertelement <4 x i16> undef, i16 1, i32 %sel2
+; OPT: store <4 x i16> %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -193,8 +172,7 @@ entry:
 ; GCN-NOT: buffer_
 ; GCN: v_mov_b32_e32 v1, 0
 
-; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
-; OPT: %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
+; OPT: ret i64 undef
 
 define i64 @ptr_alloca_bitcast() {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
index 5651d1c922cc5c..0baa2e6705ba48 100644
--- a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
@@ -11,7 +11,7 @@ target datalayout = "A5"
 ; so that we do not need to fully unroll it.
 
 ; FUNC-LABEL: @private_memory
-; LOOP-NOT: alloca
+; LOOP-NOT: = alloca
 ; LOOP: loop.header:
 ; LOOP: br i1 %{{[^,]+}}, label %exit, label %loop.header
 

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index f91b5d6c2cbfeb..500c33acf3e21b 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -1,6 +1,7 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn- -passes=sroa %s -o %t.sroa.ll
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='sroa,amdgpu-promote-alloca,instcombine' < %s | FileCheck -check-prefix=OPT %s
 
 target datalayout = "A5"
 
@@ -75,8 +76,8 @@ entry:
 ; OPT-LABEL: @vector_write_read_bitcast_to_float(
 ; OPT-NOT:   alloca
 ; OPT: bb2:
-; OPT:  %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
-; OPT:  %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10
+; OPT:  %promotealloca = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
+; OPT:  %0 = insertelement <6 x float> %promotealloca, float %tmp71, i32 %tmp10
 ; OPT: .preheader:
 ; OPT:  %bc = bitcast <6 x float> %0 to <6 x i32>
 ; OPT:  %1 = extractelement <6 x i32> %bc, i32 %tmp20
@@ -84,24 +85,13 @@ entry:
 ; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
 ; GCN-ALLOCA: buffer_store_dword
 
-; GCN-PROMOTE-COUNT-6: v_cmp_eq_u16
-; GCN-PROMOTE-COUNT-6: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
 
 ; GCN: s_cbranch
 
 ; GCN-ALLOCA: buffer_load_dword
 
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-
 ; GCN-PROMOTE: ScratchSize: 0
 
 define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) {
@@ -147,8 +137,8 @@ bb15:                                             ; preds = %.preheader
 ; OPT-LABEL: @vector_write_read_bitcast_to_double(
 ; OPT-NOT:   alloca
 ; OPT: bb2:
-; OPT:  %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
-; OPT:  %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10
+; OPT:  %promotealloca = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
+; OPT:  %0 = insertelement <6 x double> %promotealloca, double %tmp71, i32 %tmp10
 ; OPT: .preheader:
 ; OPT:  %bc = bitcast <6 x double> %0 to <6 x i64>
 ; OPT:  %1 = extractelement <6 x i64> %bc, i32 %tmp20
@@ -208,8 +198,8 @@ bb15:                                             ; preds = %.preheader
 ; OPT-LABEL: @vector_write_read_bitcast_to_i64(
 ; OPT-NOT:   alloca
 ; OPT: bb2:
-; OPT:  %tmp.sroa.0.0 = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
-; OPT:  %0 = insertelement <6 x i64> %tmp.sroa.0.0, i64 %tmp6, i32 %tmp9
+; OPT:  %promotealloca = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
+; OPT:  %0 = insertelement <6 x i64> %promotealloca, i64 %tmp6, i32 %tmp9
 ; OPT: .preheader:
 ; OPT:  %1 = extractelement <6 x i64> %0, i32 %tmp18
 
@@ -272,7 +262,7 @@ bb13:                                             ; preds = %.preheader
 ; OPT: store i32 %0, ptr addrspace(1) %out, align 4
 
 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
-; GCN-COUNT-4: buffer_store_dword
+; GCN-COUNT: buffer_store_dword
 
 define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) {
 entry:


        

