[llvm] [AMDGPU] PromoteAlloca: reject known out-of-bounds index (PR #139700)
Robert Imschweiler via llvm-commits
llvm-commits at lists.llvm.org
Tue May 13 03:04:48 PDT 2025
https://github.com/ro-i created https://github.com/llvm/llvm-project/pull/139700
Don't try to handle GEPs whose index is already known to be out of bounds; this avoids a crash while generating the shufflevector.
This LLVM defect was identified via the AMD Fuzzing project.
Note: The original (reduced) test case was:
```llvm
define amdgpu_vs void @promote_memcpy_two_aggrs() {
%f1 = alloca [5 x float], align 4, addrspace(5)
%G2 = getelementptr <1 x double>, ptr addrspace(5) %f1, i1 true
call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) %f1, ptr addrspace(5) %G2, i32 8, i1 false)
ret void
}
; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(5) noalias readonly captures(none), i32, i1 immarg) #0
attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
```
This uncovers other potential issues, especially the use of `i1 1` (= `i1 true`) as an index. `getelementptr` sign-extends the index value to the width of the pointer index type (i32), which in this case yields `i32 -1` (a very large value when interpreted as unsigned) and *not* `i32 1`. This might need a more general fix. Nonetheless, I think promote-alloca should be able to handle invalid indices, which is why I started with this PR.
>From eeae90f1a11e5e63b8f1b06fde693b0b767540bd Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Tue, 13 May 2025 04:49:58 -0500
Subject: [PATCH] [AMDGPU] PromoteAlloca: reject known out-of-bounds index
Don't try to handle GEPs where the index is already known to be
out-of-bounds -> avoid crash while generating shufflevector.
This LLVM defect was identified via the AMD Fuzzing project.
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 3 +-
.../test/CodeGen/AMDGPU/array-ptr-calc-i32.ll | 6 ++--
.../CodeGen/AMDGPU/promote-alloca-multidim.ll | 9 +++--
.../AMDGPU/promote-alloca-shufflevector.ll | 34 +++++++++++++++++++
4 files changed, 43 insertions(+), 9 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 933ee6ceeaf4a..69e1d952c7622 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -438,7 +438,8 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
SmallMapVector<Value *, APInt, 4> VarOffsets;
APInt ConstOffset(BW, 0);
if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
- !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
+ !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset) ||
+ ConstOffset.getZExtValue() >= Alloca->getAllocationSize(DL))
return nullptr;
unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index e1bbc243344b0..ba9691b265b50 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -18,9 +18,9 @@ declare void @llvm.amdgcn.s.barrier() #2
; SI-ALLOCA: v_lshlrev_b32_e32 [[SIZE_SCALE:v[0-9]+]], 2, [[LOAD_A]]
; SI-ALLOCA: v_mov_b32_e32 [[PTRREG:v[0-9]+]], [[SIZE_SCALE]]
-; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
+; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen
; SI-ALLOCA: s_barrier
-; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
+; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen
;
; SI-PROMOTE: LDSByteSize: 0
define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) #0 {
@@ -32,7 +32,7 @@ define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias
%a = load i32, ptr addrspace(1) %a_ptr, !range !0, !noundef !{}
%b = load i32, ptr addrspace(1) %b_ptr, !range !0, !noundef !{}
%result = add i32 %a, %b
- %alloca_ptr = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b
+ %alloca_ptr = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
store i32 %result, ptr addrspace(5) %alloca_ptr, align 4
; Dummy call
call void @llvm.amdgcn.s.barrier()
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
index d72f158763c61..810fcced208d1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -312,13 +312,12 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[SEL3]], 3
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 6, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x i64> [[TMP4]], i64 [[TMP6]], i64 1
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP1]], 2
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x i64> [[TMP7]], i64 [[TMP9]], i64 2
; CHECK-NEXT: [[ELEM:%.*]] = extractelement <3 x i64> [[TMP10]], i32 2
@@ -337,7 +336,7 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
%gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
- %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i64 1, i64 %sel3
+ %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i64 0, i64 %sel3
%load = load <3 x i64>, ptr addrspace(5) %gep
%elem = extractelement <3 x i64> %load, i32 2
store i64 %elem, ptr %out
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll
new file mode 100644
index 0000000000000..f4d65ec0f7a03
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple amdgcn -passes=amdgpu-promote-alloca-to-vector -S < %s | FileCheck %s
+
+; Skip promote-alloca in case of an index which is known to be out of bounds.
+
+define amdgpu_kernel void @out_of_bounds() {
+; CHECK-LABEL: define amdgpu_kernel void @out_of_bounds() {
+; CHECK-NEXT: [[PTR:%.*]] = alloca [4 x float], align 4, addrspace(5)
+; CHECK-NEXT: [[ELEM_PTR:%.*]] = getelementptr [4 x float], ptr addrspace(5) [[PTR]], i32 0, i32 42
+; CHECK-NEXT: call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) [[PTR]], ptr addrspace(5) [[ELEM_PTR]], i32 8, i1 false)
+; CHECK-NEXT: ret void
+;
+ %ptr = alloca [4 x float], align 4, addrspace(5)
+ %elem_ptr = getelementptr [4 x float], ptr addrspace(5) %ptr, i32 0, i32 42
+ call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) %ptr, ptr addrspace(5) %elem_ptr, i32 8, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @in_bounds() {
+; CHECK-LABEL: define amdgpu_kernel void @in_bounds() {
+; CHECK-NEXT: [[PTR:%.*]] = freeze <4 x float> poison
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[PTR]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT: ret void
+;
+ %ptr = alloca [4 x float], align 4, addrspace(5)
+ %elem_ptr = getelementptr [4 x float], ptr addrspace(5) %ptr, i32 0, i32 2
+ call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) %ptr, ptr addrspace(5) %elem_ptr, i32 8, i1 false)
+ ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) writeonly captures(none), ptr addrspace(5) readonly captures(none), i32, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
More information about the llvm-commits
mailing list