[llvm] [AMDGPU] PromoteAlloca: reject known out-of-bounds index (PR #139700)

Tue May 13 03:04:48 PDT 2025

https://github.com/ro-i created https://github.com/llvm/llvm-project/pull/139700

Don't try to handle GEPs whose index is already known to be out of bounds. This avoids a crash while generating the shufflevector.

This LLVM defect was identified via the AMD Fuzzing project.

Note: The original (reduced) test case was:
```llvm
define amdgpu_vs void @promote_memcpy_two_aggrs() {
  %f1 = alloca [5 x float], align 4, addrspace(5)
  %G2 = getelementptr <1 x double>, ptr addrspace(5) %f1, i1 true
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) %f1, ptr addrspace(5) %G2, i32 8, i1 false)
  ret void
}

; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(5) noalias readonly captures(none), i32, i1 immarg) #0

attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
```
This uncovers other potential issues, especially the use of `i1 1` (= `i1 true`) as an index. `getelementptr` sign-extends the index value to the width of the pointer index type (i32), so `i1 true` becomes `i32 -1` (a very large value when interpreted as unsigned) and *not* `i32 1`. This might need a more general fix. Nonetheless, I think promote-alloca should be able to handle invalid indices, which is why I started with this PR.

>From eeae90f1a11e5e63b8f1b06fde693b0b767540bd Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Tue, 13 May 2025 04:49:58 -0500
Subject: [PATCH] [AMDGPU] PromoteAlloca: reject known out-of-bounds index

Don't try to handle GEPs where the index is already known to be
out-of-bounds -> avoid crash while generating shufflevector.

This LLVM defect was identified via the AMD Fuzzing project.
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp |  3 +-
 .../test/CodeGen/AMDGPU/array-ptr-calc-i32.ll |  6 ++--
 .../CodeGen/AMDGPU/promote-alloca-multidim.ll |  9 +++--
 .../AMDGPU/promote-alloca-shufflevector.ll    | 34 +++++++++++++++++++
 4 files changed, 43 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 933ee6ceeaf4a..69e1d952c7622 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -438,7 +438,8 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   SmallMapVector<Value *, APInt, 4> VarOffsets;
   APInt ConstOffset(BW, 0);
   if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
-      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
+      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset) ||
+      ConstOffset.getZExtValue() >= Alloca->getAllocationSize(DL))
     return nullptr;
 
   unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index e1bbc243344b0..ba9691b265b50 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -18,9 +18,9 @@ declare void @llvm.amdgcn.s.barrier() #2
 ; SI-ALLOCA: v_lshlrev_b32_e32 [[SIZE_SCALE:v[0-9]+]], 2, [[LOAD_A]]
 
 ; SI-ALLOCA: v_mov_b32_e32 [[PTRREG:v[0-9]+]], [[SIZE_SCALE]]
-; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
+; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen
 ; SI-ALLOCA: s_barrier
-; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
+; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen
 ;
 ; SI-PROMOTE: LDSByteSize: 0
 define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) #0 {
@@ -32,7 +32,7 @@ define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias
   %a = load i32, ptr addrspace(1) %a_ptr, !range !0, !noundef !{}
   %b = load i32, ptr addrspace(1) %b_ptr, !range !0, !noundef !{}
   %result = add i32 %a, %b
-  %alloca_ptr = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b
+  %alloca_ptr = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
   store i32 %result, ptr addrspace(5) %alloca_ptr, align 4
   ; Dummy call
   call void @llvm.amdgcn.s.barrier()
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
index d72f158763c61..810fcced208d1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -312,13 +312,12 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[SEL3]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 6, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x i64> [[TMP4]], i64 [[TMP6]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP1]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <3 x i64> [[TMP7]], i64 [[TMP9]], i64 2
 ; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP10]], i32 2
@@ -337,7 +336,7 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
   %gep.01 = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i32 0, i32 1, i32 0
   store <3 x i64> <i64 0, i64 1, i64 2>, ptr addrspace(5) %gep.00
   store <3 x i64> <i64 3, i64 4, i64 5>, ptr addrspace(5) %gep.01
-  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i64 1, i64 %sel3
+  %gep = getelementptr inbounds [2 x [3 x i64]], ptr addrspace(5) %alloca, i64 0, i64 %sel3
   %load = load <3 x i64>, ptr addrspace(5) %gep
   %elem = extractelement <3 x i64> %load, i32 2
   store i64 %elem, ptr %out
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll
new file mode 100644
index 0000000000000..f4d65ec0f7a03
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-shufflevector.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple amdgcn -passes=amdgpu-promote-alloca-to-vector -S < %s | FileCheck %s
+
+; Skip promote-alloca in case of an index which is known to be out of bounds.
+
+define amdgpu_kernel void @out_of_bounds() {
+; CHECK-LABEL: define amdgpu_kernel void @out_of_bounds() {
+; CHECK-NEXT:    [[PTR:%.*]] = alloca [4 x float], align 4, addrspace(5)
+; CHECK-NEXT:    [[ELEM_PTR:%.*]] = getelementptr [4 x float], ptr addrspace(5) [[PTR]], i32 0, i32 42
+; CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) [[PTR]], ptr addrspace(5) [[ELEM_PTR]], i32 8, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %ptr = alloca [4 x float], align 4, addrspace(5)
+  %elem_ptr = getelementptr [4 x float], ptr addrspace(5) %ptr, i32 0, i32 42
+  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) %ptr, ptr addrspace(5) %elem_ptr, i32 8, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @in_bounds() {
+; CHECK-LABEL: define amdgpu_kernel void @in_bounds() {
+; CHECK-NEXT:    [[PTR:%.*]] = freeze <4 x float> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[PTR]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:    ret void
+;
+  %ptr = alloca [4 x float], align 4, addrspace(5)
+  %elem_ptr = getelementptr [4 x float], ptr addrspace(5) %ptr, i32 0, i32 2
+  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) %ptr, ptr addrspace(5) %elem_ptr, i32 8, i1 false)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) writeonly captures(none), ptr addrspace(5) readonly captures(none), i32, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }