[llvm] 091bfa7 - [AMDGPU] Use SSAUpdater in PromoteAlloca

via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 27 23:12:27 PDT 2023


Author: pvanhout
Date: 2023-06-28T08:12:22+02:00
New Revision: 091bfa76db64fbe96d0e53d99b2068cc05f6aa16

URL: https://github.com/llvm/llvm-project/commit/091bfa76db64fbe96d0e53d99b2068cc05f6aa16
DIFF: https://github.com/llvm/llvm-project/commit/091bfa76db64fbe96d0e53d99b2068cc05f6aa16.diff

LOG: [AMDGPU] Use SSAUpdater in PromoteAlloca

This removes PromoteAlloca's reliance on a second SROA run to remove the alloca completely: the pass now performs the full transformation directly.

Note that PromoteAlloca still relies on SROA running first to
canonicalize the IR. For instance, PromoteAlloca no longer handles
aggregate types, because those should be simplified by SROA before they
reach the pass.
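
As a rough, hand-written sketch (not output copied from the pass, and
using a made-up @example kernel), promoting a small [2 x float] alloca
with cross-block uses now goes roughly from:

  define amdgpu_kernel void @example(i1 %cond, float %v, ptr addrspace(1) %out) {
  entry:
    %stack = alloca [2 x float], align 4, addrspace(5)
    %gep0 = getelementptr [2 x float], ptr addrspace(5) %stack, i32 0, i32 0
    br i1 %cond, label %then, label %end
  then:
    store float %v, ptr addrspace(5) %gep0, align 4
    br label %end
  end:
    %val = load float, ptr addrspace(5) %gep0, align 4
    store float %val, ptr addrspace(1) %out, align 4
    ret void
  }

to something like:

  define amdgpu_kernel void @example(i1 %cond, float %v, ptr addrspace(1) %out) {
  entry:
    br i1 %cond, label %then, label %end
  then:
    ; the store becomes an insertelement into the current vector value
    %0 = insertelement <2 x float> undef, float %v, i32 0
    br label %end
  end:
    ; SSAUpdater joins the per-block vector values with a phi; the initial
    ; value is undef because the alloca models uninitialized memory
    %promotealloca = phi <2 x float> [ %0, %then ], [ undef, %entry ]
    %val = extractelement <2 x float> %promotealloca, i32 0
    store float %val, ptr addrspace(1) %out, align 4
    ret void
  }

The alloca, the GEP, and the original load/store are all erased by the
pass itself, so no follow-up SROA run is needed to clean them up.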

Reviewed By: #amdgpu, arsenm

Differential Revision: https://reviews.llvm.org/D152706

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
    llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
    llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index cd289e6470f2c..2dfb57792da5d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -28,7 +28,9 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
@@ -38,6 +40,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 
 #define DEBUG_TYPE "amdgpu-promote-alloca"
 
@@ -45,20 +48,20 @@ using namespace llvm;
 
 namespace {
 
-static cl::opt<bool> DisablePromoteAllocaToVector(
-  "disable-promote-alloca-to-vector",
-  cl::desc("Disable promote alloca to vector"),
-  cl::init(false));
+static cl::opt<bool>
+    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
+                                 cl::desc("Disable promote alloca to vector"),
+                                 cl::init(false));
 
-static cl::opt<bool> DisablePromoteAllocaToLDS(
-  "disable-promote-alloca-to-lds",
-  cl::desc("Disable promote alloca to LDS"),
-  cl::init(false));
+static cl::opt<bool>
+    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
+                              cl::desc("Disable promote alloca to LDS"),
+                              cl::init(false));
 
 static cl::opt<unsigned> PromoteAllocaToVectorLimit(
-  "amdgpu-promote-alloca-to-vector-limit",
-  cl::desc("Maximum byte size to consider promote alloca to vector"),
-  cl::init(0));
+    "amdgpu-promote-alloca-to-vector-limit",
+    cl::desc("Maximum byte size to consider promote alloca to vector"),
+    cl::init(0));
 
 // Shared implementation which can do both promotion to vector and to LDS.
 class AMDGPUPromoteAllocaImpl {
@@ -80,17 +83,16 @@ class AMDGPUPromoteAllocaImpl {
 
   /// BaseAlloca is the alloca root the search started from.
   /// Val may be that alloca or a recursive user of it.
-  bool collectUsesWithPtrTypes(Value *BaseAlloca,
-                               Value *Val,
-                               std::vector<Value*> &WorkList) const;
+  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
+                               std::vector<Value *> &WorkList) const;
 
   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
   /// Returns true if both operands are derived from the same alloca. Val should
   /// be the same value as one of the input operands of UseInst.
   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
-                                       Instruction *UseInst,
-                                       int OpIdx0, int OpIdx1) const;
+                                       Instruction *UseInst, int OpIdx0,
+                                       int OpIdx1) const;
 
   /// Check whether we have enough local memory for promotion.
   bool hasSufficientLocalMem(const Function &F);
@@ -253,6 +255,10 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
       Changed = true;
   }
 
+  // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
+  // dangling pointers. If we want to reuse it past this point, the loop above
+  // would need to be updated to remove successfully promoted allocas.
+
   return Changed;
 }
 
@@ -269,6 +275,10 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
   using namespace PatternMatch;
   // For now we only care about non-volatile memsets that affect the whole type
   // (start at index 0 and fill the whole alloca).
+  //
+  // TODO: Now that we moved to PromoteAlloca we could handle any memsets
+  // (except maybe volatile ones?) - we just need to use shufflevector if it
+  // only affects a subset of the vector.
   const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
   return I->getOperand(0) == AI &&
          match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
@@ -319,6 +329,107 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   return ConstantInt::get(GEP->getContext(), Quot);
 }
 
+void promoteAllocaUserToVector(
+    Instruction *Inst, const DataLayout &DL, SSAUpdater &Updater,
+    FixedVectorType *VectorTy, unsigned VecStoreSize, unsigned ElementSize,
+    DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
+    std::map<GetElementPtrInst *, Value *> &GEPVectorIdx) {
+  // Note: we use InstSimplifyFolder because it can leverage the DataLayout
+  // to do more folding, especially in the case of vector splats.
+  IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
+                                        InstSimplifyFolder(DL));
+  Builder.SetInsertPoint(Inst);
+
+  Type *VecEltTy = VectorTy->getElementType();
+
+  switch (Inst->getOpcode()) {
+  case Instruction::Load: {
+    Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
+    Value *Index = calculateVectorIndex(
+        cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
+
+    // loading the full vector
+    if (DL.getTypeStoreSize(Inst->getType()) == VecStoreSize) {
+      assert(cast<Constant>(Index)->isZeroValue());
+
+      Value *NewVal = Builder.CreateBitCast(Vec, Inst->getType());
+      Inst->replaceAllUsesWith(NewVal);
+      break;
+    }
+
+    Value *ExtractElement = Builder.CreateExtractElement(Vec, Index);
+    if (Inst->getType() != VecEltTy)
+      ExtractElement =
+          Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
+
+    Inst->replaceAllUsesWith(ExtractElement);
+    break;
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(Inst);
+    Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
+    Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
+    Value *Elt = SI->getValueOperand();
+
+    // Storing the full vector
+    if (DL.getTypeStoreSize(Elt->getType()) == VecStoreSize) {
+      assert(cast<Constant>(Index)->isZeroValue());
+      Updater.AddAvailableValue(Inst->getParent(),
+                                Builder.CreateBitCast(Elt, VectorTy));
+      break;
+    }
+
+    if (Elt->getType() != VecEltTy)
+      Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
+    Value *NewVec = Builder.CreateInsertElement(Vec, Elt, Index);
+
+    Updater.AddAvailableValue(Inst->getParent(), NewVec);
+    break;
+  }
+  case Instruction::Call: {
+    if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
+      ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
+      unsigned NumCopied = Length->getZExtValue() / ElementSize;
+      MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)];
+      unsigned SrcBegin = TI->SrcIndex->getZExtValue();
+      unsigned DestBegin = TI->DestIndex->getZExtValue();
+
+      SmallVector<int> Mask;
+      for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
+        if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
+          Mask.push_back(SrcBegin++);
+        } else {
+          Mask.push_back(Idx);
+        }
+      }
+
+      Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
+      Value *NewVec = Builder.CreateShuffleVector(Vec, Mask);
+
+      Updater.AddAvailableValue(Inst->getParent(), NewVec);
+    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+      Value *Elt = MSI->getOperand(1);
+      if (DL.getTypeStoreSize(VecEltTy) > 1) {
+        Value *EltBytes =
+            Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt);
+        Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
+      }
+
+      Value *Splat =
+          Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
+
+      Updater.AddAvailableValue(Inst->getParent(), Splat);
+    } else {
+      llvm_unreachable("Unsupported call when promoting alloca to vector");
+    }
+    break;
+  }
+
+  default:
+    llvm_unreachable("Inconsistency in instructions promotable to vector");
+  }
+}
+
 // FIXME: Should try to pick the most likely to be profitable allocas first.
 bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -365,6 +476,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
   std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
   SmallVector<Instruction *> WorkList;
+  SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;
   SmallVector<Use *, 8> Uses;
   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
@@ -393,12 +505,18 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
         return RejectUser(Inst, "pointer is being stored");
 
       Type *AccessTy = getLoadStoreType(Inst);
+      if (AccessTy->isAggregateType())
+        return RejectUser(Inst, "unsupported load/store as aggregate");
+      assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
+
       Ptr = Ptr->stripPointerCasts();
 
-      // Alloca already accessed as vector, leave alone.
+      // Alloca already accessed as vector.
       if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
-                                DL->getTypeStoreSize(AccessTy))
+                                DL->getTypeStoreSize(AccessTy)) {
+        WorkList.push_back(Inst);
         continue;
+      }
 
       // Check that this is a simple access of a vector element.
       bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
@@ -416,6 +534,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       // Look through bitcasts.
       for (Use &U : Inst->uses())
         Uses.push_back(&U);
+      UsersToRemove.push_back(Inst);
       continue;
     }
 
@@ -429,6 +548,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       GEPVectorIdx[GEP] = Index;
       for (Use &U : Inst->uses())
         Uses.push_back(&U);
+      UsersToRemove.push_back(Inst);
       continue;
     }
 
@@ -481,13 +601,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     }
 
     // Ignore assume-like intrinsics and comparisons used in assumes.
-    if (isAssumeLikeIntrinsic(Inst))
+    if (isAssumeLikeIntrinsic(Inst)) {
+      UsersToRemove.push_back(Inst);
       continue;
+    }
 
     if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
           return isAssumeLikeIntrinsic(cast<Instruction>(U));
-        }))
+        })) {
+      UsersToRemove.push_back(Inst);
       continue;
+    }
 
     return RejectUser(Inst, "unhandled alloca user");
   }
@@ -506,80 +630,66 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
 
-  for (Instruction *Inst : WorkList) {
-    IRBuilder<> Builder(Inst);
-    switch (Inst->getOpcode()) {
-    case Instruction::Load: {
-      Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
-      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
-      Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
-      Value *VecValue =
-          Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
-      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
-      if (Inst->getType() != VecEltTy)
-        ExtractElement =
-            Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
-      Inst->replaceAllUsesWith(ExtractElement);
-      Inst->eraseFromParent();
-      break;
-    }
-    case Instruction::Store: {
-      StoreInst *SI = cast<StoreInst>(Inst);
-      Value *Ptr = SI->getPointerOperand();
-      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
-      Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
-      Value *VecValue =
-          Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
-      Value *Elt = SI->getValueOperand();
-      if (Elt->getType() != VecEltTy)
-        Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
-      Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
-      Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca.getAlign());
-      Inst->eraseFromParent();
-      break;
-    }
-    case Instruction::Call: {
-      if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
-        ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
-        unsigned NumCopied = Length->getZExtValue() / ElementSize;
-        MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)];
-        unsigned SrcBegin = TI->SrcIndex->getZExtValue();
-        unsigned DestBegin = TI->DestIndex->getZExtValue();
-
-        SmallVector<int> Mask;
-        for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
-          if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
-            Mask.push_back(SrcBegin++);
-          } else {
-            Mask.push_back(Idx);
-          }
-        }
-        Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
-        Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
-        Value *VecValue =
-            Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
-        Value *NewVecValue = Builder.CreateShuffleVector(VecValue, Mask);
-        Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca.getAlign());
-
-        Inst->eraseFromParent();
-      } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
-        // Ensure the length parameter of the memsets matches the new vector
-        // type's. In general, the type size shouldn't change so this is a
-        // no-op, but it's better to be safe.
-        MSI->setOperand(2, Builder.getInt64(DL->getTypeStoreSize(VectorTy)));
-      } else {
-        llvm_unreachable("Unsupported call when promoting alloca to vector");
-      }
-      break;
+  SSAUpdater Updater;
+  Updater.Initialize(VectorTy, "promotealloca");
+
+  // alloca is uninitialized memory. Imitate that by making the first value
+  // undef.
+  Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy));
+
+  // Bucket up uses of the alloca by the block they occur in.
+  // This is important because we have to handle multiple defs/uses in a block
+  // ourselves: SSAUpdater is purely for cross-block references.
+  DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
+  for (Instruction *User : WorkList)
+    UsesByBlock[User->getParent()].insert(User);
+
+  const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
+  for (Instruction *User : WorkList) {
+    BasicBlock *BB = User->getParent();
+    auto &BlockUses = UsesByBlock[BB];
+
+    // Already processed, skip.
+    if (BlockUses.empty())
+      continue;
+
+    // Only user in the block, directly process it.
+    if (BlockUses.size() == 1) {
+      promoteAllocaUserToVector(User, *DL, Updater, VectorTy, VecStoreSize,
+                                ElementSize, TransferInfo, GEPVectorIdx);
+      continue;
     }
 
-    default:
-      llvm_unreachable("Inconsistency in instructions promotable to vector");
+    // Multiple users in the block, do a linear scan to promote users in order.
+    for (Instruction &Inst : *BB) {
+      if (!BlockUses.contains(&Inst))
+        continue;
+
+      promoteAllocaUserToVector(&Inst, *DL, Updater, VectorTy, VecStoreSize,
+                                ElementSize, TransferInfo, GEPVectorIdx);
     }
+
+    // Clear the block so we know it's been processed.
+    BlockUses.clear();
+  }
+
+  // Delete worklist instructions
+  for (Instruction *I : WorkList) {
+    assert(I->use_empty());
+    I->eraseFromParent();
   }
 
+  // Delete all the users that are known to be removeable.
+  for (Instruction *I : reverse(UsersToRemove)) {
+    I->dropDroppableUses();
+    assert(I->use_empty());
+    I->eraseFromParent();
+  }
+
+  // Alloca should now be dead too.
+  assert(Alloca.use_empty());
+  Alloca.eraseFromParent();
+
   return true;
 }
 
@@ -1067,7 +1177,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
 
   CurrentLocalMemUsage = NewSize;
 
-  std::vector<Value*> WorkList;
+  std::vector<Value *> WorkList;
 
   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
     LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
@@ -1100,9 +1210,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
   TID = Builder.CreateAdd(TID, TIdZ);
 
   Value *Indices[] = {
-    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
-    TID
-  };
+      Constant::getNullValue(Type::getInt32Ty(Mod->getContext())), TID};
 
   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
   I.mutateType(Offset->getType());
@@ -1216,10 +1324,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
     assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
 
     MemTransferInst *MI = cast<MemTransferInst>(Intr);
-    auto *B =
-      Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
-                                    MI->getRawSource(), MI->getSourceAlign(),
-                                    MI->getLength(), MI->isVolatile());
+    auto *B = Builder.CreateMemTransferInst(
+        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
+        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
 
     for (unsigned I = 0; I != 2; ++I) {
       if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {

diff  --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index 97383490841e5..4da6f0e446689 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -13,38 +13,21 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
-; GCN-NEXT:    s_add_i32 s33, s32, 0xfc0
-; GCN-NEXT:    s_and_b32 s33, s33, 0xfffff000
+; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
-; GCN-NEXT:    s_addk_i32 s32, 0x3000
+; GCN-NEXT:    s_addk_i32 s32, 0x800
 ; GCN-NEXT:    v_writelane_b32 v43, s16, 0
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_writelane_b32 v42, s30, 0
-; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:92
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], s33 offset:88
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 offset:84
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 offset:80
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:76
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:72
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:68
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:64
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    v_writelane_b32 v42, s31, 1
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
@@ -58,10 +41,10 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN-NEXT:    v_readlane_b32 s30, v42, 0
 ; GCN-NEXT:    v_readlane_b32 s4, v43, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    s_addk_i32 s32, 0xd000
+; GCN-NEXT:    s_addk_i32 s32, 0xf800
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
index 785ac5ad51a04..68737cb227a00 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca < %s | FileCheck %s
 
 ; Make sure that array alloca loaded and stored as multi-element aggregates are handled correctly
 ; Strictly the promote-alloca pass shouldn't have to deal with this case as it is non-canonical, but
@@ -16,19 +16,16 @@
 
 define amdgpu_vs void @promote_1d_aggr() #0 {
 ; CHECK-LABEL: @promote_1d_aggr(
-; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
-; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4
-; CHECK-NEXT:    store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [1 x float] [[FOO3]], 0
+; CHECK-NEXT:    [[FOO3_FCA_0_GEP:%.*]] = getelementptr inbounds [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
+; CHECK-NEXT:    store float [[FOO3_FCA_0_EXTRACT]], ptr addrspace(5) [[FOO3_FCA_0_GEP]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO1]]
 ; CHECK-NEXT:    [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4
-; CHECK-NEXT:    [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
-; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
-; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0
+; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> undef, float [[FOO6]], i32 0
 ; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
 ; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
 ; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
@@ -60,22 +57,12 @@ define amdgpu_vs void @promote_1d_aggr() #0 {
 
 define amdgpu_vs void @promote_store_aggr() #0 {
 ; CHECK-LABEL: @promote_store_aggr(
-; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT:    [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4
-; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0
-; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1
-; CHECK-NEXT:    store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO3:%.*]] = sitofp i32 [[FOO1]] to float
+; CHECK-NEXT:    [[FOO6_FCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[FOO3]], 0
+; CHECK-NEXT:    [[FOO6_FCA_1_INSERT:%.*]] = insertvalue [2 x float] [[FOO6_FCA_0_INSERT]], float 2.000000e+00, 1
 ; CHECK-NEXT:    [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1
-; CHECK-NEXT:    store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4
+; CHECK-NEXT:    store [2 x float] [[FOO6_FCA_1_INSERT]], ptr addrspace(1) [[FOO7]], align 4
 ; CHECK-NEXT:    store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv, align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -100,23 +87,18 @@ define amdgpu_vs void @promote_store_aggr() #0 {
 
 define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 ; CHECK-LABEL: @promote_load_from_store_aggr(
-; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT:    [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
-; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
-; CHECK-NEXT:    store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]]
-; CHECK-NEXT:    [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
-; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
-; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0
-; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2
-; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3
+; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0
+; CHECK-NEXT:    [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
+; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
 ; CHECK-NEXT:    store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -142,22 +124,7 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 
 define amdgpu_vs void @promote_memmove_aggr() #0 {
 ; CHECK-LABEL: @promote_memmove_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4>
-; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
-; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -174,24 +141,12 @@ define amdgpu_vs void @promote_memmove_aggr() #0 {
 
 define amdgpu_vs void @promote_memcpy_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 2.000000e+00, i64 3
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 3.000000e+00, i32 [[FOO4]]
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
-; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -213,22 +168,7 @@ define amdgpu_vs void @promote_memcpy_aggr() #0 {
 
 define amdgpu_vs void @promote_memcpy_identity_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_identity_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
-; CHECK-NEXT:    store float [[TMP8]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -248,8 +188,26 @@ define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
 ; CHECK-LABEL: @promote_memcpy_two_aggrs(
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
 ; CHECK-NEXT:    [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F2]], align 4
+; CHECK-NEXT:    [[DOTFCA_0_GEP1:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP1]], align 4
+; CHECK-NEXT:    [[DOTFCA_1_GEP2:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP2]], align 4
+; CHECK-NEXT:    [[DOTFCA_2_GEP3:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP3]], align 4
+; CHECK-NEXT:    [[DOTFCA_3_GEP4:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP4]], align 4
+; CHECK-NEXT:    [[DOTFCA_4_GEP5:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP5]], align 4
+; CHECK-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 0
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 1
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 2
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 3
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
 ; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
@@ -283,7 +241,16 @@ define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
 define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 {
 ; CHECK-LABEL: @promote_memcpy_p1p5_aggr(
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
+; CHECK-NEXT:    [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
 ; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
@@ -305,21 +272,12 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0
 
 define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
 ; CHECK-LABEL: @promote_memcpy_inline_aggr(
-; CHECK-NEXT:    [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT:    store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
 ; CHECK-NEXT:    [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 3.000000e+00, i32 [[FOO4]]
-; CHECK-NEXT:    store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <5 x float> [[TMP5]], i32 0
-; CHECK-NEXT:    store float [[TMP6]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x float> zeroinitializer, float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) @pv, align 4
 ; CHECK-NEXT:    ret void
 ;
   %f1 = alloca [5 x float], addrspace(5)
@@ -347,30 +305,16 @@ declare void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, p
 
 define amdgpu_ps void @promote_double_aggr() #0 {
 ; CHECK-LABEL: @promote_double_aggr(
-; CHECK-NEXT:    [[S:%.*]] = alloca [2 x double], align 8, addrspace(5)
 ; CHECK-NEXT:    [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
 ; CHECK-NEXT:    [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8
 ; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
 ; CHECK-NEXT:    [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8
 ; CHECK-NEXT:    [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0
 ; CHECK-NEXT:    [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
-; CHECK-NEXT:    store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
-; CHECK-NEXT:    [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1
-; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0
-; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
-; CHECK-NEXT:    [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1
-; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[FOO5_FCA_0_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 0
+; CHECK-NEXT:    [[FOO5_FCA_1_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 1
+; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]]
+; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]]
 ; CHECK-NEXT:    [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
 ; CHECK-NEXT:    [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
 ; CHECK-NEXT:    [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
@@ -410,21 +354,6 @@ define amdgpu_ps void @promote_double_aggr() #0 {
 define amdgpu_kernel void @alloca_struct() #0 {
 ; CHECK-LABEL: @alloca_struct(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]]
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
index 3596c96b8cd79..4cec3bd41ce2f 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -6,7 +6,7 @@
 @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 
 ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
-; IR: alloca [10 x i32]
+; IR-NOT: alloca [10 x i32]
 ; ASM-LABEL: {{^}}promote_alloca_size_256:
 ; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 16
 ; ASM-NOT: .amdgpu_lds

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
index f31421de517cb..a99c01edcc12d 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -1,19 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca,sroa < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
 
 ; Checks that memsets don't block PromoteAlloca.
 
-; Note: memsets are just updated with the new type size. They are not eliminated which means
-; the original alloca also stay. This puts a bit more load on SROA.
-; If PromoteAlloca is moved to SSAUpdater, we could just entirely replace the memsets with
-; e.g. ConstantAggregate.
-
 define amdgpu_kernel void @memset_all_zero(i64 %val) {
 ; CHECK-LABEL: @memset_all_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <6 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -30,8 +24,7 @@ define amdgpu_kernel void @memset_all_5(i64 %val) {
 ; CHECK-LABEL: @memset_all_5(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> <i64 361700864190383365, i64 361700864190383365, i64 361700864190383365, i64 361700864190383365>, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -47,11 +40,9 @@ entry:
 define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_volatile_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
-; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [3 x i64], align 8, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_2]], i8 0, i64 24, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -64,11 +55,9 @@ entry:
 define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_badsize_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
-; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [23 x i8], align 4, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[STACK_SROA_2]], i8 0, i64 23, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -81,8 +70,10 @@ entry:
 define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_offset_ptr_nopromote(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[STACK_SROA_1:%.*]] = alloca [3 x i64], align 8, addrspace(5)
-; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_1]], i8 0, i64 24, i1 true)
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [4 x i64], ptr addrspace(5) [[STACK]], i64 0, i64 1
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[GEP]], i8 0, i64 24, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
index 70b4e94f36c07..8df15e3f7e29a 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
@@ -4,15 +4,10 @@
 define i64 @test_pointer_array(i64 %v) {
 ; OPT-LABEL: @test_pointer_array(
 ; OPT-NEXT:  entry:
-; OPT-NEXT:    [[A:%.*]] = alloca [3 x ptr], align 16, addrspace(5)
-; OPT-NEXT:    [[TMP0:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
-; OPT-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[V:%.*]] to ptr
-; OPT-NEXT:    [[TMP2:%.*]] = insertelement <3 x ptr> [[TMP0]], ptr [[TMP1]], i32 0
-; OPT-NEXT:    store <3 x ptr> [[TMP2]], ptr addrspace(5) [[A]], align 16
-; OPT-NEXT:    [[TMP3:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
-; OPT-NEXT:    [[TMP4:%.*]] = extractelement <3 x ptr> [[TMP3]], i32 0
-; OPT-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
-; OPT-NEXT:    ret i64 [[TMP5]]
+; OPT-NEXT:    [[TMP0:%.*]] = inttoptr i64 [[V:%.*]] to ptr
+; OPT-NEXT:    [[TMP1:%.*]] = insertelement <3 x ptr> undef, ptr [[TMP0]], i32 0
+; OPT-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; OPT-NEXT:    ret i64 [[TMP2]]
 ;
 entry:
   %a = alloca [3 x ptr], align 16, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index adabeab379505..083ed999ac371 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
 
 ; GCN-LABEL: {{^}}float4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
@@ -11,11 +11,8 @@
 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
 ; GCN: store_dword v{{.+}}, [[RES]]
 
-; OPT:  %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT:  store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, ptr addrspace(5) %alloca, align 4
-; OPT:  %0 = load <4 x float>, ptr addrspace(5) %alloca
-; OPT:  %1 = extractelement <4 x float> %0, i32 %sel2
-; OPT:  store float %1, ptr addrspace(1) %out, align 4
+; OPT:  %0 = extractelement <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, i32 %sel2
+; OPT:  store float %0, ptr addrspace(1) %out, align 4
 
 define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -46,12 +43,8 @@ entry:
 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
 ; GCN:     store_dwordx4 v{{.+}},
 
-; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
-; OPT: store <4 x float> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
-; OPT:  store <4 x float> %load, ptr addrspace(1) %out, align 4
+; OPT: %0 = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel2
+; OPT: store <4 x float> %0, ptr addrspace(1) %out, align 4
 
 define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -77,11 +70,8 @@ entry:
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
 
-; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, ptr addrspace(5) %alloca, align 2
-; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
-; OPT: store half %1, ptr addrspace(1) %out, align 2
+; OPT: %0 = extractelement <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, i32 %sel2
+; OPT: store half %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -105,12 +95,8 @@ entry:
 ; GCN-NOT: buffer_
 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff
 
-; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
-; OPT: store <4 x half> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
-; OPT: store <4 x half> %load, ptr addrspace(1) %out, align 2
+; OPT: %0 = insertelement <4 x half> undef, half 0xH3C00, i32 %sel2
+; OPT: store <4 x half> %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -136,11 +122,8 @@ entry:
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
 
-; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, ptr addrspace(5) %alloca, align 2
-; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
-; OPT: store i16 %1, ptr addrspace(1) %out, align 2
+; OPT: %0 = extractelement <4 x i16> <i16 1, i16 2, i16 3, i16 4>, i32 %sel2
+; OPT: store i16 %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -164,12 +147,8 @@ entry:
 ; GCN-NOT: buffer_
 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff
 
-; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
-; OPT: store <4 x i16> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
-; OPT: store <4 x i16> %load, ptr addrspace(1) %out, align 2
+; OPT: %0 = insertelement <4 x i16> undef, i16 1, i32 %sel2
+; OPT: store <4 x i16> %0, ptr addrspace(1) %out, align 2
 
 define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -193,8 +172,7 @@ entry:
 ; GCN-NOT: buffer_
 ; GCN: v_mov_b32_e32 v1, 0
 
-; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
-; OPT: %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
+; OPT: ret i64 undef
 
 define i64 @ptr_alloca_bitcast() {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
index 5651d1c922cc5..2e629a4d73d46 100644
--- a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
@@ -12,8 +12,6 @@ target datalayout = "A5"
 
 ; FUNC-LABEL: @private_memory
 ; LOOP-NOT: alloca
-; LOOP: loop.header:
-; LOOP: br i1 %{{[^,]+}}, label %exit, label %loop.header
 
 ; FULL-UNROLL: alloca
 ; FULL-UNROLL-COUNT-256: store i32 {{[0-9]+}}, ptr addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index f91b5d6c2cbfe..c287196302f69 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -1,6 +1,7 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn- -passes=sroa %s -o %t.sroa.ll
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='sroa,amdgpu-promote-alloca,instcombine' < %s | FileCheck -check-prefix=OPT %s
 
 target datalayout = "A5"
 
@@ -75,8 +76,7 @@ entry:
 ; OPT-LABEL: @vector_write_read_bitcast_to_float(
 ; OPT-NOT:   alloca
 ; OPT: bb2:
-; OPT:  %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
-; OPT:  %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10
+; OPT:  %0 = insertelement <6 x float> undef, float %tmp71, i32 %tmp10
 ; OPT: .preheader:
 ; OPT:  %bc = bitcast <6 x float> %0 to <6 x i32>
 ; OPT:  %1 = extractelement <6 x i32> %bc, i32 %tmp20
@@ -84,24 +84,13 @@ entry:
 ; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
 ; GCN-ALLOCA: buffer_store_dword
 
-; GCN-PROMOTE-COUNT-6: v_cmp_eq_u16
-; GCN-PROMOTE-COUNT-6: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
 
 ; GCN: s_cbranch
 
 ; GCN-ALLOCA: buffer_load_dword
 
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-
 ; GCN-PROMOTE: ScratchSize: 0
 
 define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) {
@@ -147,8 +136,7 @@ bb15:                                             ; preds = %.preheader
 ; OPT-LABEL: @vector_write_read_bitcast_to_double(
 ; OPT-NOT:   alloca
 ; OPT: bb2:
-; OPT:  %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
-; OPT:  %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10
+; OPT:  %0 = insertelement <6 x double> undef, double %tmp71, i32 %tmp10
 ; OPT: .preheader:
 ; OPT:  %bc = bitcast <6 x double> %0 to <6 x i64>
 ; OPT:  %1 = extractelement <6 x i64> %bc, i32 %tmp20
@@ -208,8 +196,7 @@ bb15:                                             ; preds = %.preheader
 ; OPT-LABEL: @vector_write_read_bitcast_to_i64(
 ; OPT-NOT:   alloca
 ; OPT: bb2:
-; OPT:  %tmp.sroa.0.0 = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
-; OPT:  %0 = insertelement <6 x i64> %tmp.sroa.0.0, i64 %tmp6, i32 %tmp9
+; OPT:  %0 = insertelement <6 x i64> undef, i64 %tmp6, i32 %tmp9
 ; OPT: .preheader:
 ; OPT:  %1 = extractelement <6 x i64> %0, i32 %tmp18
 
@@ -272,7 +259,7 @@ bb13:                                             ; preds = %.preheader
 ; OPT: store i32 %0, ptr addrspace(1) %out, align 4
 
 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
-; GCN-COUNT-4: buffer_store_dword
+; GCN-COUNT: buffer_store_dword
 
 define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) {
 entry:


        

