[llvm] [AMDGPU] Support i8/i16 GEP indices when promoting allocas to vectors (PR #175489)
Harrison Hao via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 13 06:55:21 PST 2026
https://github.com/harrisonGPU updated https://github.com/llvm/llvm-project/pull/175489
>From fe1485c812c006083a1544340e1b752bc0c63021 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Mon, 12 Jan 2026 00:26:01 +0800
Subject: [PATCH] [AMDGPU] Support i8/i16 GEP indices when promoting allocas to
vectors
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 55 +++++++--
.../AMDGPU/promote-alloca-vector-gep.ll | 113 ++++++++++++++++++
2 files changed, 157 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ed676c3fde2f8..f946db908cbca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -86,11 +86,13 @@ static cl::opt<unsigned>
"when sorting profitable allocas"),
cl::init(4));
-// We support vector indices of the form (A * stride) + B
-// All parts are optional.
+// We support vector indices of the form ((A * stride) >> shift) + B
+// VarIndex is A, VarMul is stride, VarShift is shift and ConstIndex is B. All
+// parts are optional.
struct GEPToVectorIndex {
Value *VarIndex = nullptr; // defaults to 0
ConstantInt *VarMul = nullptr; // defaults to 1
+ ConstantInt *VarShift = nullptr; // defaults to 0
ConstantInt *ConstIndex = nullptr; // defaults to 0
Value *Full = nullptr;
};
@@ -491,6 +493,9 @@ static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
if (I->second.VarMul)
Result = B.CreateMul(Result, I->second.VarMul);
+
+ if (I->second.VarShift)
+ Result = B.CreateAShr(Result, I->second.VarShift, "", /*isExact=*/true);
}
if (I->second.ConstIndex) {
@@ -551,24 +556,27 @@ computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
if (VarOffsets.size() > 1)
return {};
- APInt IndexQuot;
- int64_t Rem;
- APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
- if (Rem != 0)
+ // We support vector indices of the form ((A * stride) >> shift) + B.
+ // IndexQuot represents B. Check that the constant offset is a multiple
+ // of the vector element size.
+ if (ConstOffset.srem(VecElemSize) != 0)
return {};
+ APInt IndexQuot = ConstOffset.sdiv(VecElemSize);
GEPToVectorIndex Result;
if (!ConstOffset.isZero())
Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
+ // If there are no variable offsets, only a constant offset, then we're done.
if (VarOffsets.empty())
return Result;
+ // Scale is the stride in the (A * stride) part. We already verified above
+ // that there is at most one variable offset; extract its scale factor.
const auto &VarOffset = VarOffsets.front();
- APInt OffsetQuot;
- APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
- if (Rem != 0 || OffsetQuot.isZero())
+ uint64_t Scale = VarOffset.second.getZExtValue();
+ if (Scale == 0)
return {};
Result.VarIndex = VarOffset.first;
@@ -576,8 +584,33 @@ computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
if (!OffsetType)
return {};
- if (!OffsetQuot.isOne())
- Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
+ // The vector index for the variable part is: A * Scale / VecElemSize.
+ if (Scale >= (uint64_t)VecElemSize) {
+ // Scale is a multiple of VecElemSize, so the index is just A * (Scale /
+ // VecElemSize). Only the multiplier is needed.
+ if (Scale % VecElemSize != 0)
+ return {};
+
+ uint64_t VarMul = Scale / VecElemSize;
+ if (VarMul != 1)
+ Result.VarMul = ConstantInt::get(Ctx, APInt(BW, VarMul));
+ } else {
+ // VecElemSize is a multiple of Scale, so the index is A / (VecElemSize /
+ // Scale). The divisor must be a power of 2 so we can use a right shift, and
+ // A must be known to be divisible by that divisor.
+ if ((uint64_t)VecElemSize % Scale != 0)
+ return {};
+
+ uint64_t Divisor = VecElemSize / Scale;
+ if (!isPowerOf2_64(Divisor))
+ return {};
+
+ KnownBits KB = computeKnownBits(VarOffset.first, DL);
+ if (KB.countMinTrailingZeros() < Log2_64(Divisor))
+ return {};
+
+ Result.VarShift = ConstantInt::get(Ctx, APInt(BW, Log2_64(Divisor)));
+ }
return Result;
}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
index 76e1868b3c4b9..d4afe25900871 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
@@ -250,6 +250,119 @@ bb2:
store i32 0, ptr addrspace(5) %extractelement
ret void
}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i8_0_or_4(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i8_0_or_4(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4
+; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[ALLOCA]], float 7.000000e+00, i32 [[TMP1]]
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 0, i32 4
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 4
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i8_4_or_8(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i8_4_or_8(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8
+; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[ALLOCA]], float 7.000000e+00, i32 [[TMP1]]
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 4, i32 8
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 4
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_nested_vector_gep_i8_4_or_5_no_promote(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_nested_vector_gep_i8_4_or_5_no_promote(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <3 x float>, align 16, addrspace(5)
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5
+; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[ALLOCA]], i32 [[INDEX]]
+; CHECK-NEXT: store float 7.000000e+00, ptr addrspace(5) [[ELT]], align 1
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 4, i32 5
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 1
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i16_0_or_2(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i16_0_or_2(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[ALLOCA]], float 7.000000e+00, i32 [[TMP1]]
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 0, i32 2
+ %elt = getelementptr inbounds nuw i16, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 4
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i16_2_or_4(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i16_2_or_4(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 2, i32 4
+; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[ALLOCA]], float 7.000000e+00, i32 [[TMP1]]
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 2, i32 4
+ %elt = getelementptr inbounds nuw i16, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 4
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i16_1_or_2_no_promote(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i16_1_or_2_no_promote(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <3 x float>, align 16, addrspace(5)
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 1, i32 2
+; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(5) [[ALLOCA]], i32 [[INDEX]]
+; CHECK-NEXT: store float 7.000000e+00, ptr addrspace(5) [[ELT]], align 1
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %index = select i1 %idx_sel, i32 1, i32 2
+ %elt = getelementptr inbounds nuw i16, ptr addrspace(5) %alloca, i32 %index
+ store float 7.000000e+00, ptr addrspace(5) %elt, align 1
+ ret void
+}
+
+define amdgpu_ps void @scalar_alloca_vector_gep_i8_odd(i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_vector_gep_i8_odd(
+; CHECK-SAME: i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca <3 x i24>, align 1, addrspace(5)
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 3
+; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[ALLOCA]], i32 [[INDEX]]
+; CHECK-NEXT: store i8 7, ptr addrspace(5) [[ELT]], align 1
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x i24>, align 1, addrspace(5)
+ %index = select i1 %idx_sel, i32 0, i32 3
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
+ store i8 7, ptr addrspace(5) %elt, align 1
+ ret void
+}
+
;.
; CHECK: [[META0]] = !{}
; CHECK: [[RNG1]] = !{i32 0, i32 1025}
More information about the llvm-commits
mailing list