[llvm-branch-commits] [llvm] AMDGPU: Fix producing invalid IR on vector typed getelementptr (PR #114113)

Tue Oct 29 11:28:47 PDT 2024

https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/114113

This did not consider the IR change to allow a scalar base with a vector
offset part. Reject any users that are not explicitly handled.

In this situation we could handle the vector GEP, but that is a larger
change. This just avoids the IR verifier error by rejecting it.

>From be705d086aa6fe501c7fc3c80d1ac63711e205f8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 29 Oct 2024 10:26:10 -0700
Subject: [PATCH] AMDGPU: Fix producing invalid IR on vector typed
 getelementptr

This did not consider the IR change to allow a scalar base with a vector
offset part. Reject any users that are not explicitly handled.

In this situation we could handle the vector GEP, but that is a larger
change. This just avoids the IR verifier error by rejecting it.
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 18 ++++++--
 .../promote-alloca-invalid-vector-gep.ll      | 44 +++++++++++++++++++
 2 files changed, 58 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index f8744d6a483cff..7dd7388376f474 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -1159,7 +1159,6 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
     if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
       if (LI->isVolatile())
         return false;
-
       continue;
     }
 
@@ -1170,12 +1169,19 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
       // Reject if the stored value is not the pointer operand.
       if (SI->getPointerOperand() != Val)
         return false;
-    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
+      continue;
+    }
+
+    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
       if (RMW->isVolatile())
         return false;
-    } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
+      continue;
+    }
+
+    if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
       if (CAS->isVolatile())
         return false;
+      continue;
     }
 
     // Only promote a select if we know that the other select operand
@@ -1186,6 +1192,7 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
 
       // May need to rewrite constant operands.
       WorkList.push_back(ICmp);
+      continue;
     }
 
     // TODO: If we know the address is only observed through flat pointers, we
@@ -1198,8 +1205,9 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
     if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
       return false;
 
+    // TODO: Handle vectors of pointers.
     if (!User->getType()->isPointerTy())
-      continue;
+      return false;
 
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
       // Be conservative if an address could be computed outside the bounds of
@@ -1504,6 +1512,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
 
       PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
 
+      assert(isa<PointerType>(V->getType()));
+
       // FIXME: It doesn't really make sense to try to do this for all
       // instructions.
       V->mutateType(NewTy);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll
new file mode 100644
index 00000000000000..b0d578e421e280
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Check that invalid IR is not produced on a vector typed
+; getelementptr with a scalar alloca pointer base.
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(5)> [[GETELEMENTPTR]], i64 0
+; CHECK-NEXT:    store i32 0, ptr addrspace(5) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca i32, align 4, addrspace(5)
+  %getelementptr = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %extractelement = extractelement <4 x ptr addrspace(5)> %getelementptr, i64 0
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
+; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[GETELEMENTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 3, i64 2, i64 1, i64 0>
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(5)> [[GETELEMENTPTR0]], <4 x ptr addrspace(5)> [[GETELEMENTPTR1]]
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(5)> [[SELECT]], i64 1
+; CHECK-NEXT:    store i32 0, ptr addrspace(5) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca i32, align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %getelementptr1 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 3, i64 2, i64 1, i64 0>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> %getelementptr0, <4 x ptr addrspace(5)> %getelementptr1
+  %extractelement = extractelement <4 x ptr addrspace(5)> %select, i64 1
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}