[llvm] [AMDGPU] Enable i8 GEP promotion for vector allocas (PR #166132)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 10 05:14:40 PST 2025
================
@@ -250,6 +250,150 @@ bb2:
store i32 0, ptr addrspace(5) %extractelement
ret void
}
+
+define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(ptr %buffer, float %data, i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(
+; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
+; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
+; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
+; CHECK-NEXT: ret void
+; Input IR: a float is stored through a dynamic i8 (byte-offset) GEP into a <3 x float> alloca; the CHECK lines above expect no alloca and an insertelement at index (offset >> 2), i.e. element 0 or 1.
+ %alloca = alloca <3 x float>, align 16, addrspace(5) ; one <3 x float> slot in addrspace(5)
+ %vec = load <3 x float>, ptr %buffer ; incoming vector read from %buffer
+ store <3 x float> %vec, ptr addrspace(5) %alloca ; initialise the whole slot with %vec
+ %index = select i1 %idx_sel, i32 0, i32 4 ; dynamic offset: 0 when %idx_sel is true, otherwise 4
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index ; i8 element type, so %index is a byte offset
+ store float %data, ptr addrspace(5) %elt, align 4 ; overwrite one float at that byte offset
+ %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 ; read the whole slot back as a vector
+ store <3 x float> %updated, ptr %buffer, align 16 ; write the updated vector back to %buffer
+ ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(ptr %buffer, float %data, i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(
+; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
+; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
+; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
+; CHECK-NEXT: ret void
+; Input IR: a float is stored through a dynamic i8 (byte-offset) GEP into a <3 x float> alloca; the CHECK lines above expect no alloca and an insertelement at index (offset >> 2), i.e. element 1 or 2.
+ %alloca = alloca <3 x float>, align 16, addrspace(5) ; one <3 x float> slot in addrspace(5)
+ %vec = load <3 x float>, ptr %buffer ; incoming vector read from %buffer
+ store <3 x float> %vec, ptr addrspace(5) %alloca ; initialise the whole slot with %vec
+ %index = select i1 %idx_sel, i32 4, i32 8 ; dynamic offset: 4 when %idx_sel is true, otherwise 8
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index ; i8 element type, so %index is a byte offset
+ store float %data, ptr addrspace(5) %elt, align 4 ; overwrite one float at that byte offset
+ %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 ; read the whole slot back as a vector
+ store <3 x float> %updated, ptr %buffer, align 16 ; write the updated vector back to %buffer
+ ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_8(ptr %buffer, float %data, i1 %idx_sel) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_8(
+; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
+; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <8 x float> poison
+; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[VEC]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[ALLOCA]], float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[VEC]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP2]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <3 x float> [[VEC]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP4]], float [[TMP5]], i32 2
+; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8
+; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP6]], float [[DATA]], i32 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x float> poison, float [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <3 x float> [[TMP10]], float [[TMP11]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[TMP8]], i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <3 x float> [[TMP12]], float [[TMP13]], i64 2
+; CHECK-NEXT: store <3 x float> [[TMP14]], ptr [[BUFFER]], align 16
+; CHECK-NEXT: ret void
+; Input IR: a float is stored through a dynamic i8 (byte-offset) GEP based on row 0 of a [2 x <3 x float>] alloca; the CHECK lines above expect no alloca, an <8 x float> value with element-wise copies in and out, and an insertelement at index (offset >> 2).
+ %alloca = alloca [2 x <3 x float>], align 16, addrspace(5) ; array of two <3 x float> rows in addrspace(5)
+ %row = getelementptr inbounds [2 x <3 x float>], ptr addrspace(5) %alloca, i32 0, i32 0 ; address of row 0 (both indices are 0)
+ %vec = load <3 x float>, ptr %buffer ; incoming vector read from %buffer
+ store <3 x float> %vec, ptr addrspace(5) %row, align 16 ; fill row 0 with %vec
+ %index = select i1 %idx_sel, i32 4, i32 8 ; dynamic offset: 4 when %idx_sel is true, otherwise 8
+ %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %row, i32 %index ; i8 element type, so %index is a byte offset from %row
+ store float %data, ptr addrspace(5) %elt, align 4 ; overwrite one float at that byte offset
+ %updated = load <3 x float>, ptr addrspace(5) %row, align 16 ; reload row 0 as a whole vector
+ store <3 x float> %updated, ptr %buffer, align 16 ; write the updated vector back to %buffer
+ ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(ptr %buffer, float %data, i1 %idx_sel) {
----------------
ruiling wrote:
Maybe switch to a different calling convention so that the `alloca` is left unchanged (i.e., not promoted to LDS)?
https://github.com/llvm/llvm-project/pull/166132
More information about the llvm-commits mailing list