[llvm] 7007b99 - Revert "[AMDGPU] Use SSAUpdater in PromoteAlloca"
via llvm-commits
llvm-commits@lists.llvm.org
Wed Jun 28 02:14:24 PDT 2023
Author: pvanhout
Date: 2023-06-28T11:14:17+02:00
New Revision: 7007b9934001ff03a50e355e57c3808c7c9c6350
URL: https://github.com/llvm/llvm-project/commit/7007b9934001ff03a50e355e57c3808c7c9c6350
DIFF: https://github.com/llvm/llvm-project/commit/7007b9934001ff03a50e355e57c3808c7c9c6350.diff
LOG: Revert "[AMDGPU] Use SSAUpdater in PromoteAlloca"
This reverts commit 091bfa76db64fbe96d0e53d99b2068cc05f6aa16.
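For context: the reverted change tracked the promoted vector as an SSA value via llvm/Transforms/Utils/SSAUpdater.h instead of emitting a whole-vector load and store around every access. A minimal sketch of that pattern, condensed from the removed code in the diff below — the helper name and the elided per-instruction rewriting are simplifications, not the exact pass code:

    #include "llvm/Transforms/Utils/SSAUpdater.h"

    // Sketch: rewrite promotable uses of an alloca as SSA values of VectorTy.
    static void rewriteWithSSAUpdater(AllocaInst &Alloca,
                                      FixedVectorType *VectorTy) {
      SSAUpdater Updater;
      Updater.Initialize(VectorTy, "promotealloca");
      // An alloca is uninitialized memory, so the initial value is undef.
      Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy));
      // For each promotable load:
      //   Value *Vec = Updater.GetValueAtEndOfBlock(Load->getParent());
      //   ...then replace the load with an extractelement of Vec.
      // For each promotable store:
      //   ...build NewVec with insertelement, then publish it:
      //   Updater.AddAvailableValue(Store->getParent(), NewVec);
      // SSAUpdater inserts any PHI nodes needed across blocks; multiple
      // defs/uses inside one block are handled by a linear scan in the pass.
    }

The revert restores the older lowering shown in the diff, where every promoted access reloads and rewrites the full vector in memory.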
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 2dfb57792da5d..cd289e6470f2c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -28,9 +28,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
@@ -40,7 +38,6 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
#define DEBUG_TYPE "amdgpu-promote-alloca"
@@ -48,20 +45,20 @@ using namespace llvm;
namespace {
-static cl::opt<bool>
- DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
- cl::desc("Disable promote alloca to vector"),
- cl::init(false));
+static cl::opt<bool> DisablePromoteAllocaToVector(
+ "disable-promote-alloca-to-vector",
+ cl::desc("Disable promote alloca to vector"),
+ cl::init(false));
-static cl::opt<bool>
- DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
- cl::desc("Disable promote alloca to LDS"),
- cl::init(false));
+static cl::opt<bool> DisablePromoteAllocaToLDS(
+ "disable-promote-alloca-to-lds",
+ cl::desc("Disable promote alloca to LDS"),
+ cl::init(false));
static cl::opt<unsigned> PromoteAllocaToVectorLimit(
- "amdgpu-promote-alloca-to-vector-limit",
- cl::desc("Maximum byte size to consider promote alloca to vector"),
- cl::init(0));
+ "amdgpu-promote-alloca-to-vector-limit",
+ cl::desc("Maximum byte size to consider promote alloca to vector"),
+ cl::init(0));
// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
@@ -83,16 +80,17 @@ class AMDGPUPromoteAllocaImpl {
/// BaseAlloca is the alloca root the search started from.
/// Val may be that alloca or a recursive user of it.
- bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
- std::vector<Value *> &WorkList) const;
+ bool collectUsesWithPtrTypes(Value *BaseAlloca,
+ Value *Val,
+ std::vector<Value*> &WorkList) const;
/// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
/// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
/// Returns true if both operands are derived from the same alloca. Val should
/// be the same value as one of the input operands of UseInst.
bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
- Instruction *UseInst, int OpIdx0,
- int OpIdx1) const;
+ Instruction *UseInst,
+ int OpIdx0, int OpIdx1) const;
/// Check whether we have enough local memory for promotion.
bool hasSufficientLocalMem(const Function &F);
@@ -255,10 +253,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
Changed = true;
}
- // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
- // dangling pointers. If we want to reuse it past this point, the loop above
- // would need to be updated to remove successfully promoted allocas.
-
return Changed;
}
@@ -275,10 +269,6 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
using namespace PatternMatch;
// For now we only care about non-volatile memsets that affect the whole type
// (start at index 0 and fill the whole alloca).
- //
- // TODO: Now that we moved to PromoteAlloca we could handle any memsets
- // (except maybe volatile ones?) - we just need to use shufflevector if it
- // only affects a subset of the vector.
const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
return I->getOperand(0) == AI &&
match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
@@ -329,107 +319,6 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return ConstantInt::get(GEP->getContext(), Quot);
}
-void promoteAllocaUserToVector(
- Instruction *Inst, const DataLayout &DL, SSAUpdater &Updater,
- FixedVectorType *VectorTy, unsigned VecStoreSize, unsigned ElementSize,
- DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
- std::map<GetElementPtrInst *, Value *> &GEPVectorIdx) {
- // Note: we use InstSimplifyFolder because it can leverage the DataLayout
- // to do more folding, especially in the case of vector splats.
- IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
- InstSimplifyFolder(DL));
- Builder.SetInsertPoint(Inst);
-
- Type *VecEltTy = VectorTy->getElementType();
-
- switch (Inst->getOpcode()) {
- case Instruction::Load: {
- Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
- Value *Index = calculateVectorIndex(
- cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
-
- // loading the full vector
- if (DL.getTypeStoreSize(Inst->getType()) == VecStoreSize) {
- assert(cast<Constant>(Index)->isZeroValue());
-
- Value *NewVal = Builder.CreateBitCast(Vec, Inst->getType());
- Inst->replaceAllUsesWith(NewVal);
- break;
- }
-
- Value *ExtractElement = Builder.CreateExtractElement(Vec, Index);
- if (Inst->getType() != VecEltTy)
- ExtractElement =
- Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
-
- Inst->replaceAllUsesWith(ExtractElement);
- break;
- }
- case Instruction::Store: {
- StoreInst *SI = cast<StoreInst>(Inst);
- Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
- Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
- Value *Elt = SI->getValueOperand();
-
- // Storing the full vector
- if (DL.getTypeStoreSize(Elt->getType()) == VecStoreSize) {
- assert(cast<Constant>(Index)->isZeroValue());
- Updater.AddAvailableValue(Inst->getParent(),
- Builder.CreateBitCast(Elt, VectorTy));
- break;
- }
-
- if (Elt->getType() != VecEltTy)
- Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
- Value *NewVec = Builder.CreateInsertElement(Vec, Elt, Index);
-
- Updater.AddAvailableValue(Inst->getParent(), NewVec);
- break;
- }
- case Instruction::Call: {
- if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
- ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
- unsigned NumCopied = Length->getZExtValue() / ElementSize;
- MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)];
- unsigned SrcBegin = TI->SrcIndex->getZExtValue();
- unsigned DestBegin = TI->DestIndex->getZExtValue();
-
- SmallVector<int> Mask;
- for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
- if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
- Mask.push_back(SrcBegin++);
- } else {
- Mask.push_back(Idx);
- }
- }
-
- Value *Vec = Updater.GetValueAtEndOfBlock(Inst->getParent());
- Value *NewVec = Builder.CreateShuffleVector(Vec, Mask);
-
- Updater.AddAvailableValue(Inst->getParent(), NewVec);
- } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
- Value *Elt = MSI->getOperand(1);
- if (DL.getTypeStoreSize(VecEltTy) > 1) {
- Value *EltBytes =
- Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt);
- Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
- }
-
- Value *Splat =
- Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
-
- Updater.AddAvailableValue(Inst->getParent(), Splat);
- } else {
- llvm_unreachable("Unsupported call when promoting alloca to vector");
- }
- break;
- }
-
- default:
- llvm_unreachable("Inconsistency in instructions promotable to vector");
- }
-}
-
// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -476,7 +365,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
SmallVector<Instruction *> WorkList;
- SmallVector<Instruction *> UsersToRemove;
SmallVector<Instruction *> DeferredInsts;
SmallVector<Use *, 8> Uses;
DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
@@ -505,18 +393,12 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return RejectUser(Inst, "pointer is being stored");
Type *AccessTy = getLoadStoreType(Inst);
- if (AccessTy->isAggregateType())
- return RejectUser(Inst, "unsupported load/store as aggregate");
- assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
-
Ptr = Ptr->stripPointerCasts();
- // Alloca already accessed as vector.
+ // Alloca already accessed as vector, leave alone.
if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
- DL->getTypeStoreSize(AccessTy)) {
- WorkList.push_back(Inst);
+ DL->getTypeStoreSize(AccessTy))
continue;
- }
// Check that this is a simple access of a vector element.
bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
@@ -534,7 +416,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
// Look through bitcasts.
for (Use &U : Inst->uses())
Uses.push_back(&U);
- UsersToRemove.push_back(Inst);
continue;
}
@@ -548,7 +429,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
GEPVectorIdx[GEP] = Index;
for (Use &U : Inst->uses())
Uses.push_back(&U);
- UsersToRemove.push_back(Inst);
continue;
}
@@ -601,17 +481,13 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
}
// Ignore assume-like intrinsics and comparisons used in assumes.
- if (isAssumeLikeIntrinsic(Inst)) {
- UsersToRemove.push_back(Inst);
+ if (isAssumeLikeIntrinsic(Inst))
continue;
- }
if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
return isAssumeLikeIntrinsic(cast<Instruction>(U));
- })) {
- UsersToRemove.push_back(Inst);
+ }))
continue;
- }
return RejectUser(Inst, "unhandled alloca user");
}
@@ -630,66 +506,80 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
<< *VectorTy << '\n');
- SSAUpdater Updater;
- Updater.Initialize(VectorTy, "promotealloca");
-
- // alloca is uninitialized memory. Imitate that by making the first value
- // undef.
- Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy));
-
- // Bucket up uses of the alloca by the block they occur in.
- // This is important because we have to handle multiple defs/uses in a block
- // ourselves: SSAUpdater is purely for cross-block references.
- DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
- for (Instruction *User : WorkList)
- UsesByBlock[User->getParent()].insert(User);
-
- const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
- for (Instruction *User : WorkList) {
- BasicBlock *BB = User->getParent();
- auto &BlockUses = UsesByBlock[BB];
-
- // Already processed, skip.
- if (BlockUses.empty())
- continue;
-
- // Only user in the block, directly process it.
- if (BlockUses.size() == 1) {
- promoteAllocaUserToVector(User, *DL, Updater, VectorTy, VecStoreSize,
- ElementSize, TransferInfo, GEPVectorIdx);
- continue;
+ for (Instruction *Inst : WorkList) {
+ IRBuilder<> Builder(Inst);
+ switch (Inst->getOpcode()) {
+ case Instruction::Load: {
+ Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
+ Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
+ Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
+ Value *VecValue =
+ Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
+ Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+ if (Inst->getType() != VecEltTy)
+ ExtractElement =
+ Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
+ Inst->replaceAllUsesWith(ExtractElement);
+ Inst->eraseFromParent();
+ break;
}
-
- // Multiple users in the block, do a linear scan to promote users in order.
- for (Instruction &Inst : *BB) {
- if (!BlockUses.contains(&Inst))
- continue;
-
- promoteAllocaUserToVector(&Inst, *DL, Updater, VectorTy, VecStoreSize,
- ElementSize, TransferInfo, GEPVectorIdx);
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(Inst);
+ Value *Ptr = SI->getPointerOperand();
+ Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
+ Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
+ Value *VecValue =
+ Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
+ Value *Elt = SI->getValueOperand();
+ if (Elt->getType() != VecEltTy)
+ Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
+ Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
+ Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca.getAlign());
+ Inst->eraseFromParent();
+ break;
+ }
+ case Instruction::Call: {
+ if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
+ ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
+ unsigned NumCopied = Length->getZExtValue() / ElementSize;
+ MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)];
+ unsigned SrcBegin = TI->SrcIndex->getZExtValue();
+ unsigned DestBegin = TI->DestIndex->getZExtValue();
+
+ SmallVector<int> Mask;
+ for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
+ if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
+ Mask.push_back(SrcBegin++);
+ } else {
+ Mask.push_back(Idx);
+ }
+ }
+ Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
+ Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
+ Value *VecValue =
+ Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
+ Value *NewVecValue = Builder.CreateShuffleVector(VecValue, Mask);
+ Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca.getAlign());
+
+ Inst->eraseFromParent();
+ } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+ // Ensure the length parameter of the memsets matches the new vector
+ // type's. In general, the type size shouldn't change so this is a
+ // no-op, but it's better to be safe.
+ MSI->setOperand(2, Builder.getInt64(DL->getTypeStoreSize(VectorTy)));
+ } else {
+ llvm_unreachable("Unsupported call when promoting alloca to vector");
+ }
+ break;
}
- // Clear the block so we know it's been processed.
- BlockUses.clear();
- }
-
- // Delete worklist instructions
- for (Instruction *I : WorkList) {
- assert(I->use_empty());
- I->eraseFromParent();
- }
-
- // Delete all the users that are known to be removeable.
- for (Instruction *I : reverse(UsersToRemove)) {
- I->dropDroppableUses();
- assert(I->use_empty());
- I->eraseFromParent();
+ default:
+ llvm_unreachable("Inconsistency in instructions promotable to vector");
+ }
}
- // Alloca should now be dead too.
- assert(Alloca.use_empty());
- Alloca.eraseFromParent();
-
return true;
}
@@ -1177,7 +1067,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
CurrentLocalMemUsage = NewSize;
- std::vector<Value *> WorkList;
+ std::vector<Value*> WorkList;
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
@@ -1210,7 +1100,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
TID = Builder.CreateAdd(TID, TIdZ);
Value *Indices[] = {
- Constant::getNullValue(Type::getInt32Ty(Mod->getContext())), TID};
+ Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
+ TID
+ };
Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
I.mutateType(Offset->getType());
@@ -1324,9 +1216,10 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
MemTransferInst *MI = cast<MemTransferInst>(Intr);
- auto *B = Builder.CreateMemTransferInst(
- ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
- MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
+ auto *B =
+ Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
+ MI->getRawSource(), MI->getSourceAlign(),
+ MI->getLength(), MI->isVolatile());
for (unsigned I = 0; I != 2; ++I) {
if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
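For comparison, the restored code above rewrites each scalar access through an explicit whole-vector load/store. A condensed sketch of the restored Instruction::Store case (taken from the hunk above; the surrounding switch and GEP-to-index bookkeeping are omitted):

    // Read the whole promoted vector, insert the element, write it back.
    IRBuilder<> Builder(SI);
    Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
    Type *VecPtrTy = VectorTy->getPointerTo(Alloca.getAddressSpace());
    Value *BitCast = Builder.CreateBitCast(&Alloca, VecPtrTy);
    Value *Vec = Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca.getAlign());
    Value *Elt = SI->getValueOperand();
    if (Elt->getType() != VecEltTy)
      Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
    Builder.CreateAlignedStore(Builder.CreateInsertElement(Vec, Elt, Index),
                               BitCast, Alloca.getAlign());
    SI->eraseFromParent();

The redundant full-vector loads and stores this produces are expected to be cleaned up by later passes, which is why several RUN lines in the tests below move sroa after amdgpu-promote-alloca.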
diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index 4da6f0e446689..97383490841e5 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -13,21 +13,38 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s16, s33
-; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_add_i32 s33, s32, 0xfc0
+; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
-; GCN-NEXT: s_addk_i32 s32, 0x800
+; GCN-NEXT: s_addk_i32 s32, 0x3000
; GCN-NEXT: v_writelane_b32 v43, s16, 0
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
-; GCN-NEXT: v_writelane_b32 v42, s30, 0
-; GCN-NEXT: v_mov_b32_e32 v0, v8
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: v_writelane_b32 v42, s30, 0
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:88
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:80
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:76
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:72
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:68
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, v8
; GCN-NEXT: v_writelane_b32 v42, s31, 1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
@@ -41,10 +58,10 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
; GCN-NEXT: v_readlane_b32 s30, v42, 0
; GCN-NEXT: v_readlane_b32 s4, v43, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0xf800
+; GCN-NEXT: s_addk_i32 s32, 0xd000
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
index 68737cb227a00..785ac5ad51a04 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
; Make sure that array allocas loaded and stored as multi-element aggregates are handled correctly
; Strictly the promote-alloca pass shouldn't have to deal with this case as it is non-canonical, but
@@ -16,16 +16,19 @@
define amdgpu_vs void @promote_1d_aggr() #0 {
; CHECK-LABEL: @promote_1d_aggr(
+; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
; CHECK-NEXT: [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5)
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
+; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4
-; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [1 x float] [[FOO3]], 0
-; CHECK-NEXT: [[FOO3_FCA_0_GEP:%.*]] = getelementptr inbounds [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
-; CHECK-NEXT: store float [[FOO3_FCA_0_EXTRACT]], ptr addrspace(5) [[FOO3_FCA_0_GEP]], align 4
-; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO1]]
+; CHECK-NEXT: store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
; CHECK-NEXT: [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4
-; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[FOO6]], i32 0
+; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
+; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0
; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
@@ -57,12 +60,22 @@ define amdgpu_vs void @promote_1d_aggr() #0 {
define amdgpu_vs void @promote_store_aggr() #0 {
; CHECK-LABEL: @promote_store_aggr(
+; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4
-; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO1]] to float
-; CHECK-NEXT: [[FOO6_FCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[FOO3]], 0
-; CHECK-NEXT: [[FOO6_FCA_1_INSERT:%.*]] = insertvalue [2 x float] [[FOO6_FCA_0_INSERT]], float 2.000000e+00, 1
+; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT: [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0
+; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1
+; CHECK-NEXT: store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4
; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1
-; CHECK-NEXT: store [2 x float] [[FOO6_FCA_1_INSERT]], ptr addrspace(1) [[FOO7]], align 4
+; CHECK-NEXT: store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4
; CHECK-NEXT: store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv, align 16
; CHECK-NEXT: ret void
;
@@ -87,18 +100,23 @@ define amdgpu_vs void @promote_store_aggr() #0 {
define amdgpu_vs void @promote_load_from_store_aggr() #0 {
; CHECK-LABEL: @promote_load_from_store_aggr(
+; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
+; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
-; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0
-; CHECK-NEXT: [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i64 1
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
-; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
-; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
-; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
+; CHECK-NEXT: store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]]
+; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
+; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0
+; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1
+; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2
+; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3
; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
; CHECK-NEXT: ret void
;
@@ -124,7 +142,22 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
define amdgpu_vs void @promote_memmove_aggr() #0 {
; CHECK-LABEL: @promote_memmove_aggr(
-; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(1) @pv, align 4
+; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
+; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
+; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4>
+; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
+; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
%f1 = alloca [5 x float], addrspace(5)
@@ -141,12 +174,24 @@ define amdgpu_vs void @promote_memmove_aggr() #0 {
define amdgpu_vs void @promote_memcpy_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_aggr(
+; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 2.000000e+00, i64 3
+; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, float 3.000000e+00, i32 [[FOO4]]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
-; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
+; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
+; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
%f1 = alloca [5 x float], addrspace(5)
@@ -168,7 +213,22 @@ define amdgpu_vs void @promote_memcpy_aggr() #0 {
define amdgpu_vs void @promote_memcpy_identity_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_identity_aggr(
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) @pv, align 4
+; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1
+; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3
+; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0
+; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
%f1 = alloca [5 x float], addrspace(5)
@@ -188,26 +248,8 @@ define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
; CHECK-LABEL: @promote_memcpy_two_aggrs(
; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
; CHECK-NEXT: [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT: [[DOTFCA_0_GEP1:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP1]], align 4
-; CHECK-NEXT: [[DOTFCA_1_GEP2:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP2]], align 4
-; CHECK-NEXT: [[DOTFCA_2_GEP3:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP3]], align 4
-; CHECK-NEXT: [[DOTFCA_3_GEP4:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP4]], align 4
-; CHECK-NEXT: [[DOTFCA_4_GEP5:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP5]], align 4
-; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 0
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
-; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 1
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
-; CHECK-NEXT: [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 2
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
-; CHECK-NEXT: [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 3
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
-; CHECK-NEXT: [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 4
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
+; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F2]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
@@ -241,16 +283,7 @@ define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 {
; CHECK-LABEL: @promote_memcpy_p1p5_aggr(
; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
-; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
-; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
-; CHECK-NEXT: [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
-; CHECK-NEXT: [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
-; CHECK-NEXT: [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
-; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
+; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
@@ -272,12 +305,21 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0
define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_inline_aggr(
+; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
+; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> zeroinitializer, float 3.000000e+00, i32 [[FOO4]]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
-; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4
+; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 3.000000e+00, i32 [[FOO4]]
+; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
+; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <5 x float> [[TMP5]], i32 0
+; CHECK-NEXT: store float [[TMP6]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
%f1 = alloca [5 x float], addrspace(5)
@@ -305,16 +347,30 @@ declare void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, p
define amdgpu_ps void @promote_double_aggr() #0 {
; CHECK-LABEL: @promote_double_aggr(
+; CHECK-NEXT: [[S:%.*]] = alloca [2 x double], align 8, addrspace(5)
; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
; CHECK-NEXT: [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8
; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
; CHECK-NEXT: [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8
; CHECK-NEXT: [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0
; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
-; CHECK-NEXT: [[FOO5_FCA_0_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 0
-; CHECK-NEXT: [[FOO5_FCA_1_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 1
-; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]]
-; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]]
+; CHECK-NEXT: store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
+; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1
+; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0
+; CHECK-NEXT: store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1
+; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]]
; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
@@ -354,6 +410,21 @@ define amdgpu_ps void @promote_double_aggr() #0 {
define amdgpu_kernel void @alloca_struct() #0 {
; CHECK-LABEL: @alloca_struct(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]]
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
index 4cec3bd41ce2f..3596c96b8cd79 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -6,7 +6,7 @@
@global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
-; IR-NOT: alloca [10 x i32]
+; IR: alloca [10 x i32]
; ASM-LABEL: {{^}}promote_alloca_size_256:
; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 16
; ASM-NOT: .amdgpu_lds
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
index a99c01edcc12d..f31421de517cb 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -1,13 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca,sroa < %s | FileCheck %s
; Checks that memsets don't block PromoteAlloca.
+; Note: memsets are just updated with the new type size. They are not eliminated, which means
+; the original allocas also stay. This puts a bit more load on SROA.
+; If PromoteAlloca is moved to SSAUpdater, we could just entirely replace the memsets with
+; e.g. ConstantAggregate.
+
define amdgpu_kernel void @memset_all_zero(i64 %val) {
; CHECK-LABEL: @memset_all_zero(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i64> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
; CHECK-NEXT: ret void
;
entry:
@@ -24,7 +30,8 @@ define amdgpu_kernel void @memset_all_5(i64 %val) {
; CHECK-LABEL: @memset_all_5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> <i64 361700864190383365, i64 361700864190383365, i64 361700864190383365, i64 361700864190383365>, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
; CHECK-NEXT: ret void
;
entry:
@@ -40,9 +47,11 @@ entry:
define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
; CHECK-LABEL: @memset_volatile_nopromote(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true)
-; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
+; CHECK-NEXT: [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
+; CHECK-NEXT: [[STACK_SROA_2:%.*]] = alloca [3 x i64], align 8, addrspace(5)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_2]], i8 0, i64 24, i1 true)
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
; CHECK-NEXT: ret void
;
entry:
@@ -55,9 +64,11 @@ entry:
define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
; CHECK-LABEL: @memset_badsize_nopromote(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true)
-; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
+; CHECK-NEXT: [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
+; CHECK-NEXT: [[STACK_SROA_2:%.*]] = alloca [23 x i8], align 4, addrspace(5)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[STACK_SROA_2]], i8 0, i64 23, i1 true)
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
; CHECK-NEXT: ret void
;
entry:
@@ -70,10 +81,8 @@ entry:
define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) {
; CHECK-LABEL: @memset_offset_ptr_nopromote(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x i64], ptr addrspace(5) [[STACK]], i64 0, i64 1
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[GEP]], i8 0, i64 24, i1 true)
-; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
+; CHECK-NEXT: [[STACK_SROA_1:%.*]] = alloca [3 x i64], align 8, addrspace(5)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_1]], i8 0, i64 24, i1 true)
; CHECK-NEXT: ret void
;
entry:
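The restored pass does not eliminate supported memsets: per the Instruction::Call case in the source diff above, it only resizes their length operand and leaves the actual removal to SROA. A one-line sketch of that rewrite, with Builder, DL, and VectorTy as in the pass:

    // Keep the memset but make its length match the promoted vector's store
    // size; SROA later folds the memset and the remaining alloca away.
    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst))
      MSI->setOperand(2, Builder.getInt64(DL->getTypeStoreSize(VectorTy)));

This is what the RUN-line change in this test accounts for: sroa now runs after amdgpu-promote-alloca to clean up the surviving memsets and allocas.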
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
index 8df15e3f7e29a..70b4e94f36c07 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
@@ -4,10 +4,15 @@
define i64 @test_pointer_array(i64 %v) {
; OPT-LABEL: @test_pointer_array(
; OPT-NEXT: entry:
-; OPT-NEXT: [[TMP0:%.*]] = inttoptr i64 [[V:%.*]] to ptr
-; OPT-NEXT: [[TMP1:%.*]] = insertelement <3 x ptr> undef, ptr [[TMP0]], i32 0
-; OPT-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64
-; OPT-NEXT: ret i64 [[TMP2]]
+; OPT-NEXT: [[A:%.*]] = alloca [3 x ptr], align 16, addrspace(5)
+; OPT-NEXT: [[TMP0:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
+; OPT-NEXT: [[TMP1:%.*]] = inttoptr i64 [[V:%.*]] to ptr
+; OPT-NEXT: [[TMP2:%.*]] = insertelement <3 x ptr> [[TMP0]], ptr [[TMP1]], i32 0
+; OPT-NEXT: store <3 x ptr> [[TMP2]], ptr addrspace(5) [[A]], align 16
+; OPT-NEXT: [[TMP3:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
+; OPT-NEXT: [[TMP4:%.*]] = extractelement <3 x ptr> [[TMP3]], i32 0
+; OPT-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; OPT-NEXT: ret i64 [[TMP5]]
;
entry:
%a = alloca [3 x ptr], align 16, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index 083ed999ac371..adabeab379505 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
; GCN-LABEL: {{^}}float4_alloca_store4:
; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
@@ -11,8 +11,11 @@
; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
; GCN: store_dword v{{.+}}, [[RES]]
-; OPT: %0 = extractelement <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, i32 %sel2
-; OPT: store float %0, ptr addrspace(1) %out, align 4
+; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, ptr addrspace(5) %alloca, align 4
+; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
+; OPT: %1 = extractelement <4 x float> %0, i32 %sel2
+; OPT: store float %1, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
@@ -43,8 +46,12 @@ entry:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: store_dwordx4 v{{.+}},
-; OPT: %0 = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel2
-; OPT: store <4 x float> %0, ptr addrspace(1) %out, align 4
+; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
+; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
+; OPT: store <4 x float> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
+; OPT: store <4 x float> %load, ptr addrspace(1) %out, align 4
define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
@@ -70,8 +77,11 @@ entry:
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
-; OPT: %0 = extractelement <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, i32 %sel2
-; OPT: store half %0, ptr addrspace(1) %out, align 2
+; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, ptr addrspace(5) %alloca, align 2
+; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
+; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
+; OPT: store half %1, ptr addrspace(1) %out, align 2
define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
@@ -95,8 +105,12 @@ entry:
; GCN-NOT: buffer_
; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
-; OPT: %0 = insertelement <4 x half> undef, half 0xH3C00, i32 %sel2
-; OPT: store <4 x half> %0, ptr addrspace(1) %out, align 2
+; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
+; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
+; OPT: store <4 x half> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
+; OPT: store <4 x half> %load, ptr addrspace(1) %out, align 2
define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
@@ -122,8 +136,11 @@ entry:
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
-; OPT: %0 = extractelement <4 x i16> <i16 1, i16 2, i16 3, i16 4>, i32 %sel2
-; OPT: store i16 %0, ptr addrspace(1) %out, align 2
+; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, ptr addrspace(5) %alloca, align 2
+; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
+; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
+; OPT: store i16 %1, ptr addrspace(1) %out, align 2
define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
@@ -147,8 +164,12 @@ entry:
; GCN-NOT: buffer_
; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
-; OPT: %0 = insertelement <4 x i16> undef, i16 1, i32 %sel2
-; OPT: store <4 x i16> %0, ptr addrspace(1) %out, align 2
+; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
+; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
+; OPT: store <4 x i16> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
+; OPT: store <4 x i16> %load, ptr addrspace(1) %out, align 2
define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
@@ -172,7 +193,8 @@ entry:
; GCN-NOT: buffer_
; GCN: v_mov_b32_e32 v1, 0
-; OPT: ret i64 undef
+; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
+; OPT: %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
define i64 @ptr_alloca_bitcast() {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
index 2e629a4d73d46..5651d1c922cc5 100644
--- a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
@@ -12,6 +12,8 @@ target datalayout = "A5"
; FUNC-LABEL: @private_memory
; LOOP-NOT: alloca
+; LOOP: loop.header:
+; LOOP: br i1 %{{[^,]+}}, label %exit, label %loop.header
; FULL-UNROLL: alloca
; FULL-UNROLL-COUNT-256: store i32 {{[0-9]+}}, ptr addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index c287196302f69..f91b5d6c2cbfe 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -1,7 +1,6 @@
-; RUN: opt -S -mtriple=amdgcn- -passes=sroa %s -o %t.sroa.ll
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='sroa,amdgpu-promote-alloca,instcombine' < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
target datalayout = "A5"
@@ -76,7 +75,8 @@ entry:
; OPT-LABEL: @vector_write_read_bitcast_to_float(
; OPT-NOT: alloca
; OPT: bb2:
-; OPT: %0 = insertelement <6 x float> undef, float %tmp71, i32 %tmp10
+; OPT: %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
+; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10
; OPT: .preheader:
; OPT: %bc = bitcast <6 x float> %0 to <6 x i32>
; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20
@@ -84,13 +84,24 @@ entry:
; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
; GCN-ALLOCA: buffer_store_dword
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
+; GCN-PROMOTE-COUNT-6: v_cmp_eq_u16
+; GCN-PROMOTE-COUNT-6: v_cndmask
; GCN: s_cbranch
; GCN-ALLOCA: buffer_load_dword
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
+
; GCN-PROMOTE: ScratchSize: 0
define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) {
@@ -136,7 +147,8 @@ bb15: ; preds = %.preheader
; OPT-LABEL: @vector_write_read_bitcast_to_double(
; OPT-NOT: alloca
; OPT: bb2:
-; OPT: %0 = insertelement <6 x double> undef, double %tmp71, i32 %tmp10
+; OPT: %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
+; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10
; OPT: .preheader:
; OPT: %bc = bitcast <6 x double> %0 to <6 x i64>
; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20
@@ -196,7 +208,8 @@ bb15: ; preds = %.preheader
; OPT-LABEL: @vector_write_read_bitcast_to_i64(
; OPT-NOT: alloca
; OPT: bb2:
-; OPT: %0 = insertelement <6 x i64> undef, i64 %tmp6, i32 %tmp9
+; OPT: %tmp.sroa.0.0 = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
+; OPT: %0 = insertelement <6 x i64> %tmp.sroa.0.0, i64 %tmp6, i32 %tmp9
; OPT: .preheader:
; OPT: %1 = extractelement <6 x i64> %0, i32 %tmp18
@@ -259,7 +272,7 @@ bb13: ; preds = %.preheader
; OPT: store i32 %0, ptr addrspace(1) %out, align 4
; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
-; GCN-COUNT: buffer_store_dword
+; GCN-COUNT-4: buffer_store_dword
define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) {
entry: