[llvm] [AMDGPU] Support i8/i16 GEP indices when promoting allocas to vectors (PR #175489)
Harrison Hao via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 13 06:55:21 PST 2026
https://github.com/harrisonGPU updated https://github.com/llvm/llvm-project/pull/175489
>From fe1485c812c006083a1544340e1b752bc0c63021 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Mon, 12 Jan 2026 00:26:01 +0800
Subject: [PATCH] [AMDGPU] Support i8/i16 GEP indices when promoting allocas to
vectors
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 55 +++++++--
.../AMDGPU/promote-alloca-vector-gep.ll | 113 ++++++++++++++++++
2 files changed, 157 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ed676c3fde2f8..f946db908cbca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -86,11 +86,13 @@ static cl::opt<unsigned>
"when sorting profitable allocas"),
cl::init(4));
-// We support vector indices of the form (A * stride) + B
-// All parts are optional.
+// We support vector indices of the form ((A * stride) >> shift) + B
+// VarIndex is A, VarMul is stride, VarShift is shift and ConstIndex is B. All
+// parts are optional.
struct GEPToVectorIndex {
Value *VarIndex = nullptr; // defaults to 0
ConstantInt *VarMul = nullptr; // defaults to 1
+ ConstantInt *VarShift = nullptr; // defaults to 0
ConstantInt *ConstIndex = nullptr; // defaults to 0
Value *Full = nullptr;
};
@@ -491,6 +493,9 @@ static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
if (I->second.VarMul)
Result = B.CreateMul(Result, I->second.VarMul);
+
+ if (I->second.VarShift)
+ Result = B.CreateAShr(Result, I->second.VarShift, "", /*isExact=*/true);
}
if (I->second.ConstIndex) {
@@ -551,24 +556,27 @@ computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
if (VarOffsets.size() > 1)
return {};
- APInt IndexQuot;
- int64_t Rem;
- APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
- if (Rem != 0)
+ // We support vector indices of the form ((A * stride) >> shift) + B.
+ // IndexQuot represents B. Check that the constant offset is a multiple
+ // of the vector element size.
+ if (ConstOffset.srem(VecElemSize) != 0)
return {};
+ APInt IndexQuot = ConstOffset.sdiv(VecElemSize);
GEPToVectorIndex Result;
if (!ConstOffset.isZero())
Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
+ // If there are no variable offsets, only a constant offset, then we're done.
if (VarOffsets.empty())
return Result;
+ // Scale is the stride in the (A * stride) part. We already verified above
+ // that there is at most one variable offset; extract its scale factor.
const auto &VarOffset = VarOffsets.front();
- APInt OffsetQuot;
- APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
- if (Rem != 0 || OffsetQuot.isZero())
+ uint64_t Scale = VarOffset.second.getZExtValue();
+ if (Scale == 0)
return {};
Result.VarIndex = VarOffset.first;
@@ -576,8 +584,33 @@ computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
if (!OffsetType)
return {};
- if (!OffsetQuot.isOne())
- Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
+ // The vector index for the variable part is: A * Scale / VecElemSize.
+ if (Scale >= (uint64_t)VecElemSize) {
+ // Scale is a multiple of VecElemSize, so the index is just A * (Scale /
+ // VecElemSize). Only the multiplier is needed.
+ if (Scale % VecElemSize != 0)
+ return {};
+
+ uint64_t VarMul = Scale / VecElemSize;
+ if (VarMul != 1)
+ Result.VarMul = ConstantInt::get(Ctx, APInt(BW, VarMul));
+ } else {
+ // VecElemSize is a multiple of Scale, so the index is A / (VecElemSize /
+ // Scale). The divisor must be a power of 2 so we can use a right shift, and
+ // A must be known to be divisible by that divisor.
+ if ((uint64_t)VecElemSize % Scale != 0)
+ return {};
+
+ uint64_t Divisor = VecElemSize / Scale;
+ if (!isPowerOf2_64(Divisor))
+ return {};
+
+ KnownBits KB = computeKnownBits(VarOffset.first, DL);
+ if (KB.countMinTrailingZeros() < Log2_64(Divisor))
+ return {};
+
+ Result.VarShift = ConstantInt::get(Ctx, APInt(BW, Log2_64(Divisor)));
+ }
return Result;
}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
index 76e1868b3c4b9..d4afe25900871 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
@@ -250,6 +250,119 @@ bb2:
store i32 0, ptr addrspace(5) %extractelement
ret void
}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i8_0_or_4(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i8_0_or_4(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4
+; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[ALLOCA]], float 7.000000e+00, i32 [[TMP1]]
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 0, i32 4
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 4
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i8_4_or_8(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i8_4_or_8(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8
+; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[ALLOCA]], float 7.000000e+00, i32 [[TMP1]]
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 4, i32 8
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 4
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_nested_vector_gep_i8_4_or_5_no_promote(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_nested_vector_gep_i8_4_or_5_no_promote(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <3 x float>, align 16, addrspace(5)
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5
+; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[ALLOCA]], i32 [[INDEX]]
+; CHECK-NEXT: store float 7.000000e+00, ptr addrspace(5) [[ELT]], align 1
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 4, i32 5
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 1
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i16_0_or_2(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i16_0_or_2(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[ALLOCA]], float 7.000000e+00, i32 [[TMP1]]
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 0, i32 2
+ %elt = getelementptr inbounds nuw i16, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 4
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i16_2_or_4(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i16_2_or_4(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 2, i32 4
+; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[ALLOCA]], float 7.000000e+00, i32 [[TMP1]]
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 2, i32 4
+ %elt = getelementptr inbounds nuw i16, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 4
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i16_1_or_2_no_promote(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i16_1_or_2_no_promote(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <3 x float>, align 16, addrspace(5)
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 1, i32 2
+; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(5) [[ALLOCA]], i32 [[INDEX]]
+; CHECK-NEXT: store float 7.000000e+00, ptr addrspace(5) [[ELT]], align 1
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 1, i32 2
+ %elt = getelementptr inbounds nuw i16, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 1
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i8_odd(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i8_odd(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <3 x i24>, align 1, addrspace(5)
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 3
+; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[ALLOCA]], i32 [[INDEX]]
+; CHECK-NEXT: store i8 7, ptr addrspace(5) [[ELT]], align 1
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x i24>, align 1, addrspace(5)
+ %index = select i1 %idx_sel, i32 0, i32 3
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
+ store i8 7, ptr addrspace(5) %elt, align 1
+ ret void
+}
+
;.
; CHECK: [[META0]] = !{}
; CHECK: [[RNG1]] = !{i32 0, i32 1025}
More information about the llvm-commits
mailing list