[llvm] 7007b99 - Revert "[AMDGPU] Use SSAUpdater in PromoteAlloca"

via llvm-commits <llvm-commits@lists.llvm.org>
Wed Jun 28 02:14:24 PDT 2023


Author: pvanhout
Date: 2023-06-28T11:14:17+02:00
New Revision: 7007b9934001ff03a50e355e57c3808c7c9c6350

URL: https://github.com/llvm/llvm-project/commit/7007b9934001ff03a50e355e57c3808c7c9c6350
DIFF: https://github.com/llvm/llvm-project/commit/7007b9934001ff03a50e355e57c3808c7c9c6350.diff

LOG: Revert "[AMDGPU] Use SSAUpdater in PromoteAlloca"

This reverts commit 091bfa76db64fbe96d0e53d99b2068cc05f6aa16.
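
For context, the reverted patch drove the promotion with llvm::SSAUpdater:
each store-like user registered a new vector value for its block, each
load-like user queried the value live at that point, and the updater
materialized any needed PHI nodes. A minimal sketch of that pattern follows
(function and variable names here are illustrative, not the pass's own; the
actual code appears on the '-' side of the diff below):

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/Transforms/Utils/SSAUpdater.h"
  using namespace llvm;

  // Sketch: seed the updater with one value per defining block, then ask it
  // for the live value wherever one is needed; PHIs are inserted on demand.
  static void sketchSSAUpdaterUse(FixedVectorType *VecTy, BasicBlock *Entry,
                                  BasicBlock *DefBB, Value *NewVec,
                                  BasicBlock *UseBB) {
    SSAUpdater Updater;
    Updater.Initialize(VecTy, "promotealloca");
    // An alloca is uninitialized memory, so the entry value is undef.
    Updater.AddAvailableValue(Entry, UndefValue::get(VecTy));
    // A rewritten store defines the new vector value for its block.
    Updater.AddAvailableValue(DefBB, NewVec);
    // A rewritten load reads the value live at the end of its block;
    // same-block def/use ordering was handled by a linear scan in the pass.
    Value *Live = Updater.GetValueAtEndOfBlock(UseBB);
    (void)Live;
  }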

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
    llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
    llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 2dfb57792da5d..cd289e6470f2c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -28,9 +28,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
@@ -40,7 +38,6 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
 
 #define DEBUG_TYPE "amdgpu-promote-alloca"
 
@@ -48,20 +45,20 @@ using namespace llvm;
 
 namespace {
 
-static cl::opt<bool>
-    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
-                                 cl::desc("Disable promote alloca to vector"),
-                                 cl::init(false));
+static cl::opt<bool> DisablePromoteAllocaToVector(
+  "disable-promote-alloca-to-vector",
+  cl::desc("Disable promote alloca to vector"),
+  cl::init(false));
 
-static cl::opt<bool>
-    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
-                              cl::desc("Disable promote alloca to LDS"),
-                              cl::init(false));
+static cl::opt<bool> DisablePromoteAllocaToLDS(
+  "disable-promote-alloca-to-lds",
+  cl::desc("Disable promote alloca to LDS"),
+  cl::init(false));
 
 static cl::opt<unsigned> PromoteAllocaToVectorLimit(
-    "amdgpu-promote-alloca-to-vector-limit",
-    cl::desc("Maximum byte size to consider promote alloca to vector"),
-    cl::init(0));
+  "amdgpu-promote-alloca-to-vector-limit",
+  cl::desc("Maximum byte size to consider promote alloca to vector"),
+  cl::init(0));
 
 // Shared implementation which can do both promotion to vector and to LDS.
 class AMDGPUPromoteAllocaImpl {
@@ -83,16 +80,17 @@ class AMDGPUPromoteAllocaImpl {
 
   /// BaseAlloca is the alloca root the search started from.
   /// Val may be that alloca or a recursive user of it.
-  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
-                               std::vector<Value *> &WorkList) const;
+  bool collectUsesWithPtrTypes(Value *BaseAlloca,
+                               Value *Val,
+                               std::vector<Value*> &WorkList) const;
 
   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
   /// Returns true if both operands are derived from the same alloca. Val should
   /// be the same value as one of the input operands of UseInst.
   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
-                                       Instruction *UseInst, int OpIdx0,
-                                       int OpIdx1) const;
+                                       Instruction *UseInst,
+                                       int OpIdx0, int OpIdx1) const;
 
   /// Check whether we have enough local memory for promotion.
   bool hasSufficientLocalMem(const Function &F);
@@ -255,10 +253,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
       Changed = true;
   }
 
-  // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
-  // dangling pointers. If we want to reuse it past this point, the loop above
-  // would need to be updated to remove successfully promoted allocas.
-
   return Changed;
 }
 
@@ -275,10 +269,6 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
   using namespace PatternMatch;
   // For now we only care about non-volatile memsets that affect the whole type
   // (start at index 0 and fill the whole alloca).
-  //
-  // TODO: Now that we moved to PromoteAlloca we could handle any memsets
-  // (except maybe volatile ones?) - we just need to use shufflevector if it
-  // only affects a subset of the vector.
   const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
   return I->getOperand(0) == AI &&
          match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
@@ -329,107 +319,6 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   return ConstantInt::get(GEP->getContext(), Quot);
 }
 
-void promoteAllocaUserToVector(
-    Instruction *Inst, const DataLayout &DL, SSAUpdater &Updater,
-    FixedVectorType *VectorTy, unsigned VecStoreSize, unsigned ElementSize,
-    DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
-    std::map<GetElementPtrInst *, Value *> &GEPVectorIdx) {
-  // Note: we use InstSimplifyFolder because it can leverage the DataLayout
-  // to do more folding, especially in the case of vector splats.
-  IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
-                                        InstSimplifyFolder(DL));
-  Builder.SetInsertPoint(Inst);
-
-  Type *VecEltTy = VectorTy->getElementType();
-
-  switch (Inst->getOpcode()) {
-  case Instruction::Load: {
-    Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
-    Value *Index = calculateVectorIndex(
-        cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
-
-    // loading the full vector
-    if (DL.getTypeStoreSize(Inst->getType()) == VecStoreSize) {
-      assert(cast<Constant>(Index)->isZeroValue());
-
-      Value *NewVal = Builder.CreateBitCast(Vec, Inst->getType());
-      Inst->replaceAllUsesWith(NewVal);
-      break;
-    }
-
-    Value *ExtractElement = Builder.CreateExtractElement(Vec, Index);
-    if (Inst->getType() != VecEltTy)
-      ExtractElement =
-          Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
-
-    Inst->replaceAllUsesWith(ExtractElement);
-    break;
-  }
-  case Instruction::Store: {
-    StoreInst *SI = cast<StoreInst>(Inst);
-    Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
-    Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
-    Value *Elt = SI->getValueOperand();
-
-    // Storing the full vector
-    if (DL.getTypeStoreSize(Elt->getType()) == VecStoreSize) {
-      assert(cast<Constant>(Index)->isZeroValue());
-      Updater.AddAvailableValue(Inst->getParent(),
-                                Builder.CreateBitCast(Elt, VectorTy));
-      break;
-    }
-
-    if (Elt->getType() != VecEltTy)
-      Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
-    Value *NewVec = Builder.CreateInsertElement(Vec, Elt, Index);
-
-    Updater.AddAvailableValue(Inst->getParent(), NewVec);
-    break;
-  }
-  case Instruction::Call: {
-    if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
-      ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
-      unsigned NumCopied = Length->getZExtValue() / ElementSize;
-      MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)];
-      unsigned SrcBegin = TI->SrcIndex->getZExtValue();
-      unsigned DestBegin = TI->DestIndex->getZExtValue();
-
-      SmallVector<int> Mask;
-      for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
-        if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
-          Mask.push_back(SrcBegin++);
-        } else {
-          Mask.push_back(Idx);
-        }
-      }
-
-      Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
-      Value *NewVec = Builder.CreateShuffleVector(Vec, Mask);
-
-      Updater.AddAvailableValue(Inst->getParent(), NewVec);
-    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
-      Value *Elt = MSI->getOperand(1);
-      if (DL.getTypeStoreSize(VecEltTy) > 1) {
-        Value *EltBytes =
-            Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt);
-        Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
-      }
-
-      Value *Splat =
-          Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
-
-      Updater.AddAvailableValue(Inst->getParent(), Splat);
-    } else {
-      llvm_unreachable("Unsupported call when promoting alloca to vector");
-    }
-    break;
-  }
-
-  default:
-    llvm_unreachable("Inconsistency in instructions promotable to vector");
-  }
-}
-
 // FIXME: Should try to pick the most likely to be profitable allocas first.
 bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -476,7 +365,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
   std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
   SmallVector<Instruction *> WorkList;
-  SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;
   SmallVector<Use *, 8> Uses;
   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
@@ -505,18 +393,12 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
         return RejectUser(Inst, "pointer is being stored");
 
       Type *AccessTy = getLoadStoreType(Inst);
-      if (AccessTy->isAggregateType())
-        return RejectUser(Inst, "unsupported load/store as aggregate");
-      assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
-
       Ptr = Ptr->stripPointerCasts();
 
-      // Alloca already accessed as vector.
+      // Alloca already accessed as vector, leave alone.
       if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
-                                DL->getTypeStoreSize(AccessTy)) {
-        WorkList.push_back(Inst);
+                                DL->getTypeStoreSize(AccessTy))
         continue;
-      }
 
       // Check that this is a simple access of a vector element.
       bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
@@ -534,7 +416,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       // Look through bitcasts.
       for (Use &U : Inst->uses())
         Uses.push_back(&U);
-      UsersToRemove.push_back(Inst);
       continue;
     }
 
@@ -548,7 +429,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       GEPVectorIdx[GEP] = Index;
       for (Use &U : Inst->uses())
         Uses.push_back(&U);
-      UsersToRemove.push_back(Inst);
       continue;
     }
 
@@ -601,17 +481,13 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     }
 
     // Ignore assume-like intrinsics and comparisons used in assumes.
-    if (isAssumeLikeIntrinsic(Inst)) {
-      UsersToRemove.push_back(Inst);
+    if (isAssumeLikeIntrinsic(Inst))
       continue;
-    }
 
     if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
           return isAssumeLikeIntrinsic(cast<Instruction>(U));
-        })) {
-      UsersToRemove.push_back(Inst);
+        }))
       continue;
-    }
 
     return RejectUser(Inst, "unhandled alloca user");
   }
@@ -630,66 +506,80 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
 
-  SSAUpdater Updater;
-  Updater.Initialize(VectorTy, "promotealloca");
-
-  // alloca is uninitialized memory. Imitate that by making the first value
-  // undef.
-  Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy));
-
-  // Bucket up uses of the alloca by the block they occur in.
-  // This is important because we have to handle multiple defs/uses in a block
-  // ourselves: SSAUpdater is purely for cross-block references.
-  DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
-  for (Instruction *User : WorkList)
-    UsesByBlock[User->getParent()].insert(User);
-
-  const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
-  for (Instruction *User : WorkList) {
-    BasicBlock *BB = User->getParent();
-    auto &BlockUses = UsesByBlock[BB];
-
-    // Already processed, skip.
-    if (BlockUses.empty())
-      continue;
-
-    // Only user in the block, directly process it.
-    if (BlockUses.size() == 1) {
-      promoteAllocaUserToVector(User, *DL, Updater, VectorTy, VecStoreSize,
-                                ElementSize, TransferInfo, GEPVectorIdx);
-      continue;
+  for (Instruction *Inst : WorkList) {
+    IRBuilder<> Builder(Inst);
+    switch (Inst->getOpcode()) {
+    case Instruction::Load: {
+      Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
+      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
+      Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
+      Value *VecValue =
+          Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
+      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+      if (Inst->getType() != VecEltTy)
+        ExtractElement =
+            Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
+      Inst->replaceAllUsesWith(ExtractElement);
+      Inst->eraseFromParent();
+      break;
     }
-
-    // Multiple users in the block, do a linear scan to promote users in order.
-    for (Instruction &Inst : *BB) {
-      if (!BlockUses.contains(&Inst))
-        continue;
-
-      promoteAllocaUserToVector(&Inst, *DL, Updater, VectorTy, VecStoreSize,
-                                ElementSize, TransferInfo, GEPVectorIdx);
+    case Instruction::Store: {
+      StoreInst *SI = cast<StoreInst>(Inst);
+      Value *Ptr = SI->getPointerOperand();
+      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
+      Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
+      Value *VecValue =
+          Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
+      Value *Elt = SI->getValueOperand();
+      if (Elt->getType() != VecEltTy)
+        Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
+      Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
+      Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca.getAlign());
+      Inst->eraseFromParent();
+      break;
+    }
+    case Instruction::Call: {
+      if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
+        ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
+        unsigned NumCopied = Length->getZExtValue() / ElementSize;
+        MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)];
+        unsigned SrcBegin = TI->SrcIndex->getZExtValue();
+        unsigned DestBegin = TI->DestIndex->getZExtValue();
+
+        SmallVector<int> Mask;
+        for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
+          if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
+            Mask.push_back(SrcBegin++);
+          } else {
+            Mask.push_back(Idx);
+          }
+        }
+        Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
+        Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
+        Value *VecValue =
+            Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
+        Value *NewVecValue = Builder.CreateShuffleVector(VecValue, Mask);
+        Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca.getAlign());
+
+        Inst->eraseFromParent();
+      } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+        // Ensure the length parameter of the memsets matches the new vector
+        // type's. In general, the type size shouldn't change so this is a
+        // no-op, but it's better to be safe.
+        MSI->setOperand(2, Builder.getInt64(DL->getTypeStoreSize(VectorTy)));
+      } else {
+        llvm_unreachable("Unsupported call when promoting alloca to vector");
+      }
+      break;
     }
 
-    // Clear the block so we know it's been processed.
-    BlockUses.clear();
-  }
-
-  // Delete worklist instructions
-  for (Instruction *I : WorkList) {
-    assert(I->use_empty());
-    I->eraseFromParent();
-  }
-
-  // Delete all the users that are known to be removeable.
-  for (Instruction *I : reverse(UsersToRemove)) {
-    I->dropDroppableUses();
-    assert(I->use_empty());
-    I->eraseFromParent();
+    default:
+      llvm_unreachable("Inconsistency in instructions promotable to vector");
+    }
   }
 
-  // Alloca should now be dead too.
-  assert(Alloca.use_empty());
-  Alloca.eraseFromParent();
-
   return true;
 }
 
@@ -1177,7 +1067,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
 
   CurrentLocalMemUsage = NewSize;
 
-  std::vector<Value *> WorkList;
+  std::vector<Value*> WorkList;
 
   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
     LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
@@ -1210,7 +1100,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
   TID = Builder.CreateAdd(TID, TIdZ);
 
   Value *Indices[] = {
-      Constant::getNullValue(Type::getInt32Ty(Mod->getContext())), TID};
+    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
+    TID
+  };
 
   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
   I.mutateType(Offset->getType());
@@ -1324,9 +1216,10 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
     assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
 
     MemTransferInst *MI = cast<MemTransferInst>(Intr);
-    auto *B = Builder.CreateMemTransferInst(
-        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
-        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
+    auto *B =
+      Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
+                                    MI->getRawSource(), MI->getSourceAlign(),
+                                    MI->getLength(), MI->isVolatile());
 
     for (unsigned I = 0; I != 2; ++I) {
       if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
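
The restored lowering, visible on the '+' side above, rewrites every scalar
access through the alloca as a whole-vector read-modify-write at the access
point instead of tracking the vector as an SSA value. A sketch of the store
case (helper name and parameters are illustrative, mirroring the hunk above):

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Sketch: a scalar store into the promoted alloca becomes a whole-vector
  // load, an insertelement, and a whole-vector store at the same position.
  static void lowerScalarStore(StoreInst *SI, AllocaInst *Alloca,
                               FixedVectorType *VecTy, Value *Index) {
    IRBuilder<> Builder(SI);
    Type *VecPtrTy = VecTy->getPointerTo(Alloca->getAddressSpace());
    Value *VecPtr = Builder.CreateBitCast(Alloca, VecPtrTy);
    Value *Vec = Builder.CreateAlignedLoad(VecTy, VecPtr, Alloca->getAlign());
    Value *NewVec =
        Builder.CreateInsertElement(Vec, SI->getValueOperand(), Index);
    Builder.CreateAlignedStore(NewVec, VecPtr, Alloca->getAlign());
    SI->eraseFromParent();
  }

The load case is symmetric: load the whole vector, extract the element, and
replace the original load's uses with it.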

diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index 4da6f0e446689..97383490841e5 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -13,21 +13,38 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
-; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_add_i32 s33, s32, 0xfc0
+; GCN-NEXT:    s_and_b32 s33, s33, 0xfffff000
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    s_addk_i32 s32, 0x800
+; GCN-NEXT:    s_addk_i32 s32, 0x3000
 ; GCN-NEXT:    v_writelane_b32 v43, s16, 0
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GCN-NEXT:    v_writelane_b32 v42, s30, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, v8
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT:    v_writelane_b32 v42, s30, 0
+; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:92
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:88
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:84
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:80
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:76
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:72
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:68
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, v8
 ; GCN-NEXT:    v_writelane_b32 v42, s31, 1
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
@@ -41,10 +58,10 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN-NEXT:    v_readlane_b32 s30, v42, 0
 ; GCN-NEXT:    v_readlane_b32 s4, v43, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    s_addk_i32 s32, 0xf800
+; GCN-NEXT:    s_addk_i32 s32, 0xd000
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
index 68737cb227a00..785ac5ad51a04 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
 
 ; Make sure that array alloca loaded and stored as multi-element aggregates are handled correctly
 ; Strictly the promote-alloca pass shouldn't have to deal with this case as it is non-canonical, but
@@ -16,16 +16,19 @@
 
 define amdgpu_vs void @promote_1d_aggr() #0 {
 ; CHECK-LABEL: @promote_1d_aggr(
+; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
+; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4
-; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [1 x float] [[FOO3]], 0
-; CHECK-NEXT:    [[FOO3_FCA_0_GEP:%.*]] = getelementptr inbounds [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
-; CHECK-NEXT:    store float [[FOO3_FCA_0_EXTRACT]], ptr addrspace(5) [[FOO3_FCA_0_GEP]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO1]]
+; CHECK-NEXT:    store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
 ; CHECK-NEXT:    [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4
-; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> undef, float [[FOO6]], i32 0
+; CHECK-NEXT:    [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
+; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0
 ; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
 ; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
 ; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
@@ -57,12 +60,22 @@ define amdgpu_vs void @promote_1d_aggr() #0 {
 
 define amdgpu_vs void @promote_store_aggr() #0 {
 ; CHECK-LABEL: @promote_store_aggr(
+; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
+; CHECK-NEXT:    [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4
-; CHECK-NEXT:    [[FOO3:%.*]] = sitofp i32 [[FOO1]] to float
-; CHECK-NEXT:    [[FOO6_FCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[FOO3]], 0
-; CHECK-NEXT:    [[FOO6_FCA_1_INSERT:%.*]] = insertvalue [2 x float] [[FOO6_FCA_0_INSERT]], float 2.000000e+00, 1
+; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT:    [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT:    [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1
+; CHECK-NEXT:    store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4
 ; CHECK-NEXT:    [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1
-; CHECK-NEXT:    store [2 x float] [[FOO6_FCA_1_INSERT]], ptr addrspace(1) [[FOO7]], align 4
+; CHECK-NEXT:    store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4
 ; CHECK-NEXT:    store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv, align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -87,18 +100,23 @@ define amdgpu_vs void @promote_store_aggr() #0 {
 
 define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 ; CHECK-LABEL: @promote_load_from_store_aggr(
+; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
+; CHECK-NEXT:    [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
+; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
-; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0
-; CHECK-NEXT:    [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
-; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
+; CHECK-NEXT:    store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]]
+; CHECK-NEXT:    [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
+; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0
+; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2
+; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3
 ; CHECK-NEXT:    store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -124,7 +142,22 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 
 define amdgpu_vs void @promote_memmove_aggr() #0 {
 ; CHECK-LABEL: @promote_memmove_aggr(
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
+; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
+; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4>
+; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
+; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -141,12 +174,24 @@ define amdgpu_vs void @promote_memmove_aggr() #0 {
 
 define amdgpu_vs void @promote_memcpy_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_aggr(
+; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 2.000000e+00, i64 3
+; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, float 3.000000e+00, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
+; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -168,7 +213,22 @@ define amdgpu_vs void @promote_memcpy_aggr() #0 {
 
 define amdgpu_vs void @promote_memcpy_identity_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_identity_aggr(
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
+; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
+; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
+; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -188,26 +248,8 @@ define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
 ; CHECK-LABEL: @promote_memcpy_two_aggrs(
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    [[DOTFCA_0_GEP1:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP1]], align 4
-; CHECK-NEXT:    [[DOTFCA_1_GEP2:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP2]], align 4
-; CHECK-NEXT:    [[DOTFCA_2_GEP3:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP3]], align 4
-; CHECK-NEXT:    [[DOTFCA_3_GEP4:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP4]], align 4
-; CHECK-NEXT:    [[DOTFCA_4_GEP5:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP5]], align 4
-; CHECK-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 0
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
-; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 1
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
-; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 2
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
-; CHECK-NEXT:    [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 3
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
-; CHECK-NEXT:    [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 4
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F2]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
 ; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
@@ -241,16 +283,7 @@ define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
 define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 {
 ; CHECK-LABEL: @promote_memcpy_p1p5_aggr(
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
-; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
-; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
-; CHECK-NEXT:    [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
-; CHECK-NEXT:    [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
 ; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
@@ -272,12 +305,21 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0
 
 define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_inline_aggr(
+; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> zeroinitializer, float 3.000000e+00, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <5 x float> [[TMP5]], i32 0
+; CHECK-NEXT:    store float [[TMP6]], ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -305,16 +347,30 @@ declare void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, p
 
 define amdgpu_ps void @promote_double_aggr() #0 {
 ; CHECK-LABEL: @promote_double_aggr(
+; CHECK-NEXT:    [[S:%.*]] = alloca [2 x double], align 8, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8
 ; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8
 ; CHECK-NEXT:    [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0
 ; CHECK-NEXT:    [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
-; CHECK-NEXT:    [[FOO5_FCA_0_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 0
-; CHECK-NEXT:    [[FOO5_FCA_1_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 1
-; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]]
-; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]]
+; CHECK-NEXT:    store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT:    [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
+; CHECK-NEXT:    [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1
+; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0
+; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT:    [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1
+; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
 ; CHECK-NEXT:    [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
 ; CHECK-NEXT:    [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
@@ -354,6 +410,21 @@ define amdgpu_ps void @promote_double_aggr() #0 {
 define amdgpu_kernel void @alloca_struct() #0 {
 ; CHECK-LABEL: @alloca_struct(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]]
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
index 4cec3bd41ce2f..3596c96b8cd79 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -6,7 +6,7 @@
 @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 
 ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
-; IR-NOT: alloca [10 x i32]
+; IR: alloca [10 x i32]
 ; ASM-LABEL: {{^}}promote_alloca_size_256:
 ; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 16
 ; ASM-NOT: .amdgpu_lds

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
index a99c01edcc12d..f31421de517cb 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -1,13 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca,sroa < %s | FileCheck %s
 
 ; Checks that memsets don't block PromoteAlloca.
 
+; Note: memsets are just updated with the new type size. They are not eliminated which means
+; the original alloca also stay. This puts a bit more load on SROA.
+; If PromoteAlloca is moved to SSAUpdater, we could just entirely replace the memsets with
+; e.g. ConstantAggregate.
+
 define amdgpu_kernel void @memset_all_zero(i64 %val) {
 ; CHECK-LABEL: @memset_all_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <6 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -24,7 +30,8 @@ define amdgpu_kernel void @memset_all_5(i64 %val) {
 ; CHECK-LABEL: @memset_all_5(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> <i64 361700864190383365, i64 361700864190383365, i64 361700864190383365, i64 361700864190383365>, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -40,9 +47,11 @@ entry:
 define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_volatile_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
+; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
+; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [3 x i64], align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_2]], i8 0, i64 24, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -55,9 +64,11 @@ entry:
 define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_badsize_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
+; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
+; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [23 x i8], align 4, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[STACK_SROA_2]], i8 0, i64 23, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -70,10 +81,8 @@ entry:
 define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_offset_ptr_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [4 x i64], ptr addrspace(5) [[STACK]], i64 0, i64 1
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[GEP]], i8 0, i64 24, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
+; CHECK-NEXT:    [[STACK_SROA_1:%.*]] = alloca [3 x i64], align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_1]], i8 0, i64 24, i1 true)
 ; CHECK-NEXT:    ret void
 ;
 entry:
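
The "; Note:" comment in this test refers to the behavior restored above:
whole-alloca memsets are kept, and only their length operand is rewritten to
match the promoted vector type. Schematically (helper name and signature are
illustrative, mirroring the '+' hunk in AMDGPUPromoteAlloca.cpp):

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/IntrinsicInst.h"
  using namespace llvm;

  // Sketch: operand 2 of llvm.memset is the length; keep the intrinsic but
  // make the length match the vector type's store size (normally a no-op).
  static void updateMemSetLength(MemSetInst *MSI, FixedVectorType *VecTy,
                                 const DataLayout &DL) {
    IRBuilder<> Builder(MSI);
    MSI->setOperand(2, Builder.getInt64(DL.getTypeStoreSize(VecTy)));
  }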

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
index 8df15e3f7e29a..70b4e94f36c07 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
@@ -4,10 +4,15 @@
 define i64 @test_pointer_array(i64 %v) {
 ; OPT-LABEL: @test_pointer_array(
 ; OPT-NEXT:  entry:
-; OPT-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[V:%.*]] to ptr
-; OPT-NEXT:    [[TMP1:%.*]] = insertelement <3 x ptr> undef, ptr [[TMP0]], i32 0
-; OPT-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64
-; OPT-NEXT:    ret i64 [[TMP2]]
+; OPT-NEXT:    [[A:%.*]] = alloca [3 x ptr], align 16, addrspace(5)
+; OPT-NEXT:    [[TMP0:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
+; OPT-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[V:%.*]] to ptr
+; OPT-NEXT:    [[TMP2:%.*]] = insertelement <3 x ptr> [[TMP0]], ptr [[TMP1]], i32 0
+; OPT-NEXT:    store <3 x ptr> [[TMP2]], ptr addrspace(5) [[A]], align 16
+; OPT-NEXT:    [[TMP3:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
+; OPT-NEXT:    [[TMP4:%.*]] = extractelement <3 x ptr> [[TMP3]], i32 0
+; OPT-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; OPT-NEXT:    ret i64 [[TMP5]]
 ;
 entry:
   %a = alloca [3 x ptr], align 16, addrspace(5)

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index 083ed999ac371..adabeab379505 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
 
 ; GCN-LABEL: {{^}}float4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
@@ -11,8 +11,11 @@
 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
 ; GCN: store_dword v{{.+}}, [[RES]]
 
-; OPT:  %0 = extractelement <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, i32 %sel2
-; OPT:  store float %0, ptr addrspace(1) %out, align 4
+; OPT:  %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT:  store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, ptr addrspace(5) %alloca, align 4
+; OPT:  %0 = load <4 x float>, ptr addrspace(5) %alloca
+; OPT:  %1 = extractelement <4 x float> %0, i32 %sel2
+; OPT:  store float %1, ptr addrspace(1) %out, align 4
 
 define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -43,8 +46,12 @@ entry:
 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
 ; GCN:     store_dwordx4 v{{.+}},
 
-; OPT: %0 = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel2
-; OPT: store <4 x float> %0, ptr addrspace(1) %out, align 4
+; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
+; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
+; OPT: store <4 x float> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
+; OPT:  store <4 x float> %load, ptr addrspace(1) %out, align 4
 
 define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -70,8 +77,11 @@ entry:
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
 
-; OPT: %0 = extractelement <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, i32 %sel2
-; OPT: store half %0, ptr addrspace(1) %out, align 2
+; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, ptr addrspace(5) %alloca, align 2
+; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
+; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
+; OPT: store half %1, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -95,8 +105,12 @@ entry:
 ; GCN-NOT: buffer_
 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff
 
-; OPT: %0 = insertelement <4 x half> undef, half 0xH3C00, i32 %sel2
-; OPT: store <4 x half> %0, ptr addrspace(1) %out, align 2
+; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
+; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
+; OPT: store <4 x half> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
+; OPT: store <4 x half> %load, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -122,8 +136,11 @@ entry:
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
 
-; OPT: %0 = extractelement <4 x i16> <i16 1, i16 2, i16 3, i16 4>, i32 %sel2
-; OPT: store i16 %0, ptr addrspace(1) %out, align 2
+; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, ptr addrspace(5) %alloca, align 2
+; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
+; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
+; OPT: store i16 %1, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -147,8 +164,12 @@ entry:
 ; GCN-NOT: buffer_
 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff
 
-; OPT: %0 = insertelement <4 x i16> undef, i16 1, i32 %sel2
-; OPT: store <4 x i16> %0, ptr addrspace(1) %out, align 2
+; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
+; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
+; OPT: store <4 x i16> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
+; OPT: store <4 x i16> %load, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -172,7 +193,8 @@ entry:
 ; GCN-NOT: buffer_
 ; GCN: v_mov_b32_e32 v1, 0
 
-; OPT: ret i64 undef
+; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
+; OPT: %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
 
 define i64 @ptr_alloca_bitcast() {
 entry:

diff --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
index 2e629a4d73d46..5651d1c922cc5 100644
--- a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
@@ -12,6 +12,8 @@ target datalayout = "A5"
 
 ; FUNC-LABEL: @private_memory
 ; LOOP-NOT: alloca
+; LOOP: loop.header:
+; LOOP: br i1 %{{[^,]+}}, label %exit, label %loop.header
 
 ; FULL-UNROLL: alloca
 ; FULL-UNROLL-COUNT-256: store i32 {{[0-9]+}}, ptr addrspace(5)

diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index c287196302f69..f91b5d6c2cbfe 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -1,7 +1,6 @@
-; RUN: opt -S -mtriple=amdgcn- -passes=sroa %s -o %t.sroa.ll
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='sroa,amdgpu-promote-alloca,instcombine' < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
 
 target datalayout = "A5"
 
@@ -76,7 +75,8 @@ entry:
 ; OPT-LABEL: @vector_write_read_bitcast_to_float(
 ; OPT-NOT:   alloca
 ; OPT: bb2:
-; OPT:  %0 = insertelement <6 x float> undef, float %tmp71, i32 %tmp10
+; OPT:  %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
+; OPT:  %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10
 ; OPT: .preheader:
 ; OPT:  %bc = bitcast <6 x float> %0 to <6 x i32>
 ; OPT:  %1 = extractelement <6 x i32> %bc, i32 %tmp20
@@ -84,13 +84,24 @@ entry:
 ; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
 ; GCN-ALLOCA: buffer_store_dword
 
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
+; GCN-PROMOTE-COUNT-6: v_cmp_eq_u16
+; GCN-PROMOTE-COUNT-6: v_cndmask
 
 ; GCN: s_cbranch
 
 ; GCN-ALLOCA: buffer_load_dword
 
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
+
 ; GCN-PROMOTE: ScratchSize: 0
 
 define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) {
@@ -136,7 +147,8 @@ bb15:                                             ; preds = %.preheader
 ; OPT-LABEL: @vector_write_read_bitcast_to_double(
 ; OPT-NOT:   alloca
 ; OPT: bb2:
-; OPT:  %0 = insertelement <6 x double> undef, double %tmp71, i32 %tmp10
+; OPT:  %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
+; OPT:  %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10
 ; OPT: .preheader:
 ; OPT:  %bc = bitcast <6 x double> %0 to <6 x i64>
 ; OPT:  %1 = extractelement <6 x i64> %bc, i32 %tmp20
@@ -196,7 +208,8 @@ bb15:                                             ; preds = %.preheader
 ; OPT-LABEL: @vector_write_read_bitcast_to_i64(
 ; OPT-NOT:   alloca
 ; OPT: bb2:
-; OPT:  %0 = insertelement <6 x i64> undef, i64 %tmp6, i32 %tmp9
+; OPT:  %tmp.sroa.0.0 = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
+; OPT:  %0 = insertelement <6 x i64> %tmp.sroa.0.0, i64 %tmp6, i32 %tmp9
 ; OPT: .preheader:
 ; OPT:  %1 = extractelement <6 x i64> %0, i32 %tmp18
 
@@ -259,7 +272,7 @@ bb13:                                             ; preds = %.preheader
 ; OPT: store i32 %0, ptr addrspace(1) %out, align 4
 
 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
-; GCN-COUNT: buffer_store_dword
+; GCN-COUNT-4: buffer_store_dword
 
 define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) {
 entry:

