[llvm] [AMDGPU] Extended vector promotion to aggregate types. (PR #143784)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 11 15:03:26 PDT 2025
https://github.com/zGoldthorpe updated https://github.com/llvm/llvm-project/pull/143784
>From 84c932d12386866bbf4af33a2540e0c3cb3a3091 Mon Sep 17 00:00:00 2001
From: Zach Goldthorpe <Zach.Goldthorpe at amd.com>
Date: Wed, 11 Jun 2025 15:19:20 -0500
Subject: [PATCH 1/2] Extended vector promotion to aggregate types.
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 96 ++++---
.../CodeGen/AMDGPU/promote-alloca-structs.ll | 263 ++++++++++++++++++
2 files changed, 318 insertions(+), 41 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 700dc87d2f821..336e3a1db7e73 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -818,6 +818,28 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
return I;
}
+/// Get the scalar type underlying a homogeneous aggregate type, or nullptr if
+/// the type is non-homogeneous or contains an empty aggregate.
+static Type *getHomogeneousType(Type *Ty) {
+ if (auto *VectorTy = dyn_cast<FixedVectorType>(Ty))
+ return VectorTy->getElementType();
+ if (auto *ArrayTy = dyn_cast<ArrayType>(Ty))
+ return getHomogeneousType(ArrayTy->getElementType());
+ if (auto *StructTy = dyn_cast<StructType>(Ty)) {
+ if (StructTy->getNumElements() == 0)
+ return nullptr;
+
+ auto *Iter = StructTy->element_begin();
+ Type *HTy = getHomogeneousType(*Iter);
+ for (; Iter != StructTy->element_end(); ++Iter)
+ if (getHomogeneousType(*Iter) != HTy)
+ return nullptr;
+
+ return HTy;
+ }
+ return Ty;
+}
+
// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -828,42 +850,43 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
}
Type *AllocaTy = Alloca.getAllocatedType();
- auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
- if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
- uint64_t NumElems = 1;
- Type *ElemTy;
- do {
- NumElems *= ArrayTy->getNumElements();
- ElemTy = ArrayTy->getElementType();
- } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
-
- // Check for array of vectors
- auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
- if (InnerVectorTy) {
- NumElems *= InnerVectorTy->getNumElements();
- ElemTy = InnerVectorTy->getElementType();
- }
+ Type *ElemTy = getHomogeneousType(AllocaTy);
- if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
- unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
- if (ElementSize > 0) {
- unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
- // Expand vector if required to match padding of inner type,
- // i.e. odd size subvectors.
- // Storage size of new vector must match that of alloca for correct
- // behaviour of byte offsets and GEP computation.
- if (NumElems * ElementSize != AllocaSize)
- NumElems = AllocaSize / ElementSize;
- if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
- VectorTy = FixedVectorType::get(ElemTy, NumElems);
- }
- }
+ if (!ElemTy || !VectorType::isValidElementType(ElemTy)) {
+ LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
+ return false;
}
- if (!VectorTy) {
- LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
+ unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy);
+ if (ElementSizeInBits == 0) {
+ LLVM_DEBUG(dbgs() << " Cannot create vector of zero-sized elements.");
+ return false;
+ }
+ if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) {
+ LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
+ "does not match the type's size\n");
return false;
}
+ unsigned ElementSize = ElementSizeInBits / 8;
+ if (ElementSize == 0)
+ return false;
+
+ // Calculate the size of the corresponding vector, accounting for padding of
+ // inner types, e.g., odd-sized subvectors. Storage size of new vector must
+ // match that of alloca for correct behaviour of byte offsets and GEP
+ // computation.
+ unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+ unsigned NumElems = AllocaSize / ElementSize;
+ if (NumElems == 0) {
+ LLVM_DEBUG(dbgs() << " Cannot vectorize an empty aggregate type.");
+ return false;
+ }
+ if (NumElems * ElementSize != AllocaSize) {
+ LLVM_DEBUG(dbgs() << " Cannot convert type into vector of the same size.");
+ return false;
+ }
+ auto *VectorTy = FixedVectorType::get(ElemTy, NumElems);
+ assert(VectorTy && "Failed to create vector type.");
const unsigned MaxElements =
(MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
@@ -895,15 +918,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
- Type *VecEltTy = VectorTy->getElementType();
- unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
- if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
- LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
- "does not match the type's size\n");
- return false;
- }
- unsigned ElementSize = ElementSizeInBits / 8;
- assert(ElementSize > 0);
for (auto *U : Uses) {
Instruction *Inst = cast<Instruction>(U->getUser());
@@ -943,7 +957,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
- Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
+ Value *Index = GEPToVectorIndex(GEP, &Alloca, ElemTy, *DL, NewGEPInsts);
if (!Index)
return RejectUser(Inst, "cannot compute vector index for GEP");
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
new file mode 100644
index 0000000000000..d09f6ba1e7b68
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 -amdgpu-promote-alloca-to-vector-max-regs=32 %s | FileCheck %s
+
+declare void @clobber_i8(i8)
+
+define void @test_v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca <4 x i8>, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [4 x i8], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x <4 x i8>], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2v3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2v3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x <3 x i8>], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2a4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2a4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x [4 x i8]], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_a2a3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_a2a3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca [2 x [3 x i8]], align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s1v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s1v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<4 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s1a4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s1a4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {[4 x i8]}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2v2i8v4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2v2i8v4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<2 x i8>, <4 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2v2i8v3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2v2i8v3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2s2i8s4i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2s2i8s4i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s2s2i8s3i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s2s2i8s3i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <5 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <5 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+define void @test_s3i8s1i8v2i8(i64 %idx) {
+; CHECK-LABEL: define void @test_s3i8s1i8v2i8(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+; heterogeneous element types are not supported
+define void @test_heterogeneous(i64 %idx) {
+; CHECK-LABEL: define void @test_heterogeneous(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5)
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
+; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, i8, i16}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+; empty structs are not supported
+define void @test_empty(i64 %idx) {
+; CHECK-LABEL: define void @test_empty(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5)
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
+; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, {}}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
>From ed920b76d0c77bf5e2aa5191cfca8c4b676d4dc9 Mon Sep 17 00:00:00 2001
From: Zach Goldthorpe <Zach.Goldthorpe at amd.com>
Date: Wed, 11 Jun 2025 17:00:31 -0500
Subject: [PATCH 2/2] Refactored away recursion.
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 39 ++++++++++++-------
.../CodeGen/AMDGPU/promote-alloca-structs.ll | 34 +++++++++++++++-
2 files changed, 58 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 336e3a1db7e73..ab1c3a5919ea1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -821,23 +821,34 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
/// Get the underlying type of a homogeneous aggregate type, or nullptr if the
/// type is non-homogeneous.
static Type *getHomogeneousType(Type *Ty) {
- if (auto *VectorTy = dyn_cast<FixedVectorType>(Ty))
- return VectorTy->getElementType();
- if (auto *ArrayTy = dyn_cast<ArrayType>(Ty))
- return getHomogeneousType(ArrayTy->getElementType());
- if (auto *StructTy = dyn_cast<StructType>(Ty)) {
- if (StructTy->getNumElements() == 0)
- return nullptr;
+ Type *ElemTy = nullptr;
+ SmallVector<Type *> WorkList;
+ WorkList.push_back(Ty);
+ while (!WorkList.empty()) {
+ Type *CurTy = WorkList.pop_back_val();
- auto *Iter = StructTy->element_begin();
- Type *HTy = getHomogeneousType(*Iter);
- for (; Iter != StructTy->element_end(); ++Iter)
- if (getHomogeneousType(*Iter) != HTy)
- return nullptr;
+ // Check if the current type is an aggregate type.
+ if (auto *VectorTy = dyn_cast<FixedVectorType>(CurTy)) {
+ WorkList.push_back(VectorTy->getElementType());
+ continue;
+ }
+ if (auto *ArrayTy = dyn_cast<ArrayType>(CurTy)) {
+ WorkList.push_back(ArrayTy->getElementType());
+ continue;
+ }
+ if (auto *StructTy = dyn_cast<StructType>(CurTy)) {
+ WorkList.append(StructTy->element_begin(), StructTy->element_end());
+ continue;
+ }
- return HTy;
+ // Otherwise CurTy is a leaf (non-aggregate) type; every leaf reached must be
+ // the same type for the aggregate to be homogeneous.
+ if (!ElemTy)
+ ElemTy = CurTy;
+ else if (ElemTy != CurTy)
+ return nullptr;
}
- return Ty;
+
+ return ElemTy;
}
// FIXME: Should try to pick the most likely to be profitable allocas first.
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
index d09f6ba1e7b68..4840e451e4c4c 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
@@ -228,6 +228,21 @@ define void @test_s3i8s1i8v2i8(i64 %idx) {
ret void
}
+define void @test_s3i8i8s0(i64 %idx) {
+; CHECK-LABEL: define void @test_s3i8i8s0(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = freeze <2 x i8> poison
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: call void @clobber_i8(i8 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {i8, i8, {}}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
; heterogeneous element types are not supported
define void @test_heterogeneous(i64 %idx) {
; CHECK-LABEL: define void @test_heterogeneous(
@@ -245,10 +260,27 @@ define void @test_heterogeneous(i64 %idx) {
ret void
}
-; empty structs are not supported
+; empty types are not supported
define void @test_empty(i64 %idx) {
; CHECK-LABEL: define void @test_empty(
; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[STACK:%.*]] = alloca {}, align 4, addrspace(5)
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
+; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
+; CHECK-NEXT: call void @clobber_i8(i8 [[VAL]])
+; CHECK-NEXT: ret void
+;
+ %stack = alloca {}, align 4, addrspace(5)
+ %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+ %val = load i8, ptr addrspace(5) %ptr, align 1
+ call void @clobber_i8(i8 %val)
+ ret void
+}
+
+; singleton types are not supported
+define void @test_singleton(i64 %idx) {
+; CHECK-LABEL: define void @test_singleton(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5)
; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
More information about the llvm-commits
mailing list