[llvm] [AMDGPU] Update PromoteAlloca to handle GEPs with variable offset. (PR #122342)
Sumanth Gundapaneni via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 30 13:47:57 PST 2025
https://github.com/sgundapa updated https://github.com/llvm/llvm-project/pull/122342
From 1093642dcc9cfaf7b50bf314f18467df58ced7ab Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Thu, 9 Jan 2025 10:59:05 -0800
Subject: [PATCH 1/2] [AMDGPU] Update PromoteAlloca to handle GEPs with
variable offset.
When a GEP has a variable offset that can be optimized out, PromoteAlloca
is updated to use the refreshed index to avoid an assertion.
Issue found by fuzzer.
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 41 ++++++++++++++++---
.../AMDGPU/promote-alloca-array-aggregate.ll | 28 +++++++++++++
2 files changed, 64 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e27ef71c1c0883..1e32743c3dfeeb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -385,13 +385,42 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
}
+static bool hasVariableOffset(GetElementPtrInst *GEP) {
+ // Iterate over all operands starting from the first index (operand 0 is
+ // the base pointer).
+ for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i) {
+ Value *Op = GEP->getOperand(i);
+ // Check if the operand is not a constant integer value
+ if (!isa<ConstantInt>(Op)) {
+ return true;
+ }
+ }
+ return false;
+}
+
static Value *
-calculateVectorIndex(Value *Ptr,
- const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
+calculateVectorIndex(Value *Ptr, std::map<GetElementPtrInst *, Value *> &GEPIdx,
+ const DataLayout &DL) {
auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
if (!GEP)
return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
+ // If the index of this GEP is a variable that might be deleted,
+ // update the index with its latest value. We've already handled any GEPs
+ // with unsupported index types (in GEPToVectorIndex) at this point.
+ if (hasVariableOffset(GEP)) {
+ unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+ SmallMapVector<Value *, APInt, 4> VarOffsets;
+ APInt ConstOffset(BW, 0);
+ if (GEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) {
+ if (VarOffsets.size() == 1 && ConstOffset.isZero()) {
+ auto *UpdatedValue = VarOffsets.front().first;
+ GEPIdx[GEP] = UpdatedValue;
+ return UpdatedValue;
+ }
+ }
+ }
+
auto I = GEPIdx.find(GEP);
assert(I != GEPIdx.end() && "Must have entry for GEP!");
return I->second;
@@ -496,7 +525,7 @@ static Value *promoteAllocaUserToVector(
}
Value *Index = calculateVectorIndex(
- cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
+ cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx, DL);
// We're loading the full vector.
Type *AccessTy = Inst->getType();
@@ -552,7 +581,8 @@ static Value *promoteAllocaUserToVector(
// to know the current value. If this is a store of a single element, we
// need to know the value.
StoreInst *SI = cast<StoreInst>(Inst);
- Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
+ Value *Index =
+ calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx, DL);
Value *Val = SI->getValueOperand();
// We're storing the full vector, we can handle this without knowing CurVal.
@@ -850,7 +880,8 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
return nullptr;
- return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
+ return dyn_cast<ConstantInt>(
+ calculateVectorIndex(Ptr, GEPVectorIdx, *DL));
};
unsigned OpNum = U->getOperandNo();
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
index 05c727201bbf1d..9db416041a5bc0 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -122,6 +122,34 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
ret void
}
+%Block4 = type { [2 x i32], i32 }
+@block4 = external addrspace(1) global %Block4
+%gl_PV = type { <4 x i32>, i32, [1 x i32], [1 x i32] }
+@pv1 = external addrspace(1) global %gl_PV
+
+; This should not crash on a variable offset that can be
+; optimized out (variable foo4 in the test)
+define amdgpu_vs void @promote_load_from_store_aggr_varoff() local_unnamed_addr {
+; CHECK-LABEL: @promote_load_from_store_aggr_varoff(
+; CHECK-NEXT: [[FOO3_UNPACK2:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[FOO3_UNPACK2]], i32 2
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 [[FOO3_UNPACK2]]
+; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 3
+; CHECK-NEXT: store <4 x i32> [[FOO12]], ptr addrspace(1) @pv1, align 16
+; CHECK-NEXT: ret void
+;
+ %f1 = alloca [3 x i32], align 4, addrspace(5)
+ %G1 = getelementptr inbounds i8, ptr addrspace(5) %f1, i32 8
+ %foo3.unpack2 = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
+ store i32 %foo3.unpack2, ptr addrspace(5) %G1, align 4
+ %foo4 = load i32, ptr addrspace(5) %G1, align 4
+ %foo5 = getelementptr [3 x i32], ptr addrspace(5) %f1, i32 0, i32 %foo4
+ %foo6 = load i32, ptr addrspace(5) %foo5, align 4
+ %foo12 = insertelement <4 x i32> poison, i32 %foo6, i64 3
+ store <4 x i32> %foo12, ptr addrspace(1) @pv1, align 16
+ ret void
+}
+
define amdgpu_vs void @promote_memmove_aggr() #0 {
; CHECK-LABEL: @promote_memmove_aggr(
; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(1) @pv, align 4
From 9b7403a60b7772469dd786fc26c02702ade92f7b Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Thu, 30 Jan 2025 13:42:59 -0800
Subject: [PATCH 2/2] [AMDGPU] Update PromoteAlloca to handle updated GEP
Indexes.
The PromoteAlloca pass was using outdated cached GEP indices in some cases,
leading to an assertion failure when encountering aliased GEP indices that could
be optimized out. This commit fixes the issue by refreshing the cached index
before use.
Issue found by fuzzer.
---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 95 +++++++++----------
.../AMDGPU/promote-alloca-array-aggregate.ll | 14 +--
2 files changed, 51 insertions(+), 58 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 1e32743c3dfeeb..49b872506ccfc9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -385,57 +385,20 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
}
-static bool hasVariableOffset(GetElementPtrInst *GEP) {
- // Iterate over all operands starting from the first index (operand 0 is
- // the base pointer).
- for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i) {
- Value *Op = GEP->getOperand(i);
- // Check if the operand is not a constant integer value
- if (!isa<ConstantInt>(Op)) {
- return true;
- }
- }
- return false;
-}
-
-static Value *
-calculateVectorIndex(Value *Ptr, std::map<GetElementPtrInst *, Value *> &GEPIdx,
- const DataLayout &DL) {
- auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
- if (!GEP)
- return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
-
- // If the index of this GEP is a variable that might be deleted,
- // update the index with its latest value. We've already handled any GEPs
- // with unsupported index types (in GEPToVectorIndex) at this point.
- if (hasVariableOffset(GEP)) {
- unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
- SmallMapVector<Value *, APInt, 4> VarOffsets;
- APInt ConstOffset(BW, 0);
- if (GEP->collectOffset(DL, BW, VarOffsets, ConstOffset)) {
- if (VarOffsets.size() == 1 && ConstOffset.isZero()) {
- auto *UpdatedValue = VarOffsets.front().first;
- GEPIdx[GEP] = UpdatedValue;
- return UpdatedValue;
- }
- }
- }
-
- auto I = GEPIdx.find(GEP);
- assert(I != GEPIdx.end() && "Must have entry for GEP!");
- return I->second;
-}
-
-static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
- Type *VecElemTy, const DataLayout &DL) {
+static Value *GEPToVectorIndex(GetElementPtrInst *GEP, Type *VecElemTy,
+ const DataLayout &DL,
+ AllocaInst *Alloca = nullptr) {
// TODO: Extracting a "multiple of X" from a GEP might be a useful generic
// helper.
unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
SmallMapVector<Value *, APInt, 4> VarOffsets;
APInt ConstOffset(BW, 0);
- if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
- !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
- return nullptr;
+
+ bool CanCollect = GEP->collectOffset(DL, BW, VarOffsets, ConstOffset);
+
+ if (Alloca)
+ if (GEP->getPointerOperand()->stripPointerCasts() != Alloca || !CanCollect)
+ return nullptr;
unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
if (VarOffsets.size() > 1)
@@ -459,6 +422,36 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return ConstantInt::get(GEP->getContext(), Quot);
}
+// Returns true if \p ValueToCheck is an operand of \p GEP.
+static bool isValueInGEP(GetElementPtrInst *GEP, Value *ValueToCheck) {
+ if (!GEP || !ValueToCheck)
+ return false;
+
+ for (unsigned i = 0; i < GEP->getNumOperands(); ++i)
+ if (GEP->getOperand(i) == ValueToCheck)
+ return true;
+
+ return false;
+}
+
+static Value *
+calculateVectorIndex(Value *Ptr, std::map<GetElementPtrInst *, Value *> &GEPIdx,
+ Type *VecElemTy, const DataLayout &DL) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
+ if (!GEP)
+ return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
+
+ // Update the cached index if the value has changed.
+ if (!isValueInGEP(GEP, GEPIdx[GEP])) {
+ Value *Index = GEPToVectorIndex(GEP, VecElemTy, DL);
+ GEPIdx[GEP] = Index;
+ }
+
+ auto I = GEPIdx.find(GEP);
+ assert(I != GEPIdx.end() && "Must have entry for GEP!");
+ return I->second;
+}
+
/// Promotes a single user of the alloca to a vector form.
///
/// \param Inst Instruction to be promoted.
@@ -525,7 +518,7 @@ static Value *promoteAllocaUserToVector(
}
Value *Index = calculateVectorIndex(
- cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx, DL);
+ cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx, VecEltTy, DL);
// We're loading the full vector.
Type *AccessTy = Inst->getType();
@@ -581,8 +574,8 @@ static Value *promoteAllocaUserToVector(
// to know the current value. If this is a store of a single element, we
// need to know the value.
StoreInst *SI = cast<StoreInst>(Inst);
- Value *Index =
- calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx, DL);
+ Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx,
+ VecEltTy, DL);
Value *Val = SI->getValueOperand();
// We're storing the full vector, we can handle this without knowing CurVal.
@@ -845,7 +838,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
- Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL);
+ Value *Index = GEPToVectorIndex(GEP, VecEltTy, *DL, &Alloca);
if (!Index)
return RejectUser(Inst, "cannot compute vector index for GEP");
@@ -881,7 +874,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return nullptr;
return dyn_cast<ConstantInt>(
- calculateVectorIndex(Ptr, GEPVectorIdx, *DL));
+ calculateVectorIndex(Ptr, GEPVectorIdx, VecEltTy, *DL));
};
unsigned OpNum = U->getOperandNo();
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
index 9db416041a5bc0..7ec3cab24d9a4d 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -127,14 +127,14 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
%gl_PV = type { <4 x i32>, i32, [1 x i32], [1 x i32] }
@pv1 = external addrspace(1) global %gl_PV
-; This should not crash on a variable offset that can be
-; optimized out (variable foo4 in the test)
-define amdgpu_vs void @promote_load_from_store_aggr_varoff() local_unnamed_addr {
+; This should not crash on an aliased variable offset that can be
+; optimized out (variable %aliasToG1 in the test)
+define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
; CHECK-LABEL: @promote_load_from_store_aggr_varoff(
; CHECK-NEXT: [[FOO3_UNPACK2:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[FOO3_UNPACK2]], i32 2
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 [[FOO3_UNPACK2]]
-; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 3
+; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x i32> %input, i32 [[TMP2]], i64 3
; CHECK-NEXT: store <4 x i32> [[FOO12]], ptr addrspace(1) @pv1, align 16
; CHECK-NEXT: ret void
;
@@ -142,10 +142,10 @@ define amdgpu_vs void @promote_load_from_store_aggr_varoff() local_unnamed_addr
%G1 = getelementptr inbounds i8, ptr addrspace(5) %f1, i32 8
%foo3.unpack2 = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
store i32 %foo3.unpack2, ptr addrspace(5) %G1, align 4
- %foo4 = load i32, ptr addrspace(5) %G1, align 4
- %foo5 = getelementptr [3 x i32], ptr addrspace(5) %f1, i32 0, i32 %foo4
+ %aliasToG1 = load i32, ptr addrspace(5) %G1, align 4
+ %foo5 = getelementptr [3 x i32], ptr addrspace(5) %f1, i32 0, i32 %aliasToG1
%foo6 = load i32, ptr addrspace(5) %foo5, align 4
- %foo12 = insertelement <4 x i32> poison, i32 %foo6, i64 3
+ %foo12 = insertelement <4 x i32> %input, i32 %foo6, i64 3
store <4 x i32> %foo12, ptr addrspace(1) @pv1, align 16
ret void
}