[llvm] [AMDGPU] Enable i8 GEP promotion for vector allocas (PR #166132)
Harrison Hao via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 3 00:10:01 PST 2025
https://github.com/harrisonGPU created https://github.com/llvm/llvm-project/pull/166132
This patch adds support for the pattern:
```llvm
%elt = getelementptr inbounds i8, ptr addrspace(5) %alloca, i32 %index
```
by converting the byte offset into an element index (`index >> log2(ElemSize)`).
This allows the vector element to be updated with `insertelement` instead of
going through scratch memory.
From c1439a33ac7dd5b2d00b8344039afb243d92acf0 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Fri, 31 Oct 2025 11:13:00 +0800
Subject: [PATCH] [AMDGPU] Enable i8 GEP promotion for vector allocas
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 15 ++++++++++++--
.../AMDGPU/promote-alloca-vector-gep.ll | 20 +++++++++++++++++++
2 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ddabd25894414..793c0237cdf38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -456,10 +456,21 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
const auto &VarOffset = VarOffsets.front();
APInt OffsetQuot;
APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
- if (Rem != 0 || OffsetQuot.isZero())
- return nullptr;
+
+ Value *Scaled = nullptr;
+ if (Rem != 0 || OffsetQuot.isZero()) {
+ unsigned ElemSizeShift = Log2_64(VecElemSize);
+ Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift);
+ if (Instruction *NewInst = dyn_cast<Instruction>(Scaled))
+ NewInsts.push_back(NewInst);
+ OffsetQuot = APInt(BW, 1);
+ Rem = 0;
+ }
Value *Offset = VarOffset.first;
+ if (Scaled)
+ Offset = Scaled;
+
auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
if (!OffsetType)
return nullptr;
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
index 76e1868b3c4b9..65bddaba8dd14 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
@@ -250,6 +250,26 @@ bb2:
store i32 0, ptr addrspace(5) %extractelement
ret void
}
+
+define amdgpu_kernel void @scalar_alloca_vector_gep_i8(ptr %buffer, float %data, i32 %index) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8(
+; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i32 [[INDEX:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
+; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
+; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
+; CHECK-NEXT: ret void
+;
+ %alloca = alloca <3 x float>, align 16, addrspace(5)
+ %vec = load <3 x float>, ptr %buffer
+ store <3 x float> %vec, ptr addrspace(5) %alloca
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index
+ store float %data, ptr addrspace(5) %elt, align 4
+ %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16
+ store <3 x float> %updated, ptr %buffer, align 16
+ ret void
+}
;.
; CHECK: [[META0]] = !{}
; CHECK: [[RNG1]] = !{i32 0, i32 1025}
More information about the llvm-commits mailing list