[llvm] [InstCombine] Preserve multi-dimensional array structure in GEP optimization (PR #176414)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 16 08:16:06 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Akash Dutta (akadutta)
<details>
<summary>Changes</summary>
These changes are needed to improve register pressure on the AMDGPU backend. Commits 055bfc0 and 8a09adc simplified multi-dimensional array GEP operations into chains of single-index GEPs and stripped leading zeros. This causes a significant performance drop on AMDGPU by removing semantic information that the backend relies on. For example, multi-dimensional GEPs allow coalesced LDS reads, which reduce register pressure.
If we use the example in https://github.com/llvm/llvm-project/issues/170477, flattened GEPs lead to a pattern of ds_read->spill to scratch. If we guard against flattening GEPs for multi-dimensional arrays, the backend can instead generate code in which ds_reads are immediately used.
---
Full diff: https://github.com/llvm/llvm-project/pull/176414.diff
4 Files Affected:
- (modified) llvm/lib/Transforms/InstCombine/InstructionCombining.cpp (+58-38)
- (modified) llvm/test/Transforms/InstCombine/canonicalize-gep-constglob.ll (+1-3)
- (modified) llvm/test/Transforms/InstCombine/strcmp-3.ll (+3-3)
- (modified) llvm/test/Transforms/InstCombine/strlen-8.ll (+5-5)
``````````diff
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index d24db3de8f7b3..0daf76227418b 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2921,12 +2921,21 @@ Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
Indices.append(GEP.op_begin() + 2, GEP.op_end());
// Don't create GEPs with more than one non-zero index.
- unsigned NumNonZeroIndices = count_if(Indices, [](Value *Idx) {
- auto *C = dyn_cast<Constant>(Idx);
- return !C || !C->isNullValue();
- });
- if (NumNonZeroIndices > 1)
- return nullptr;
+ // Exception: For AMDGPU, preserve multi-dimensional array structure for
+ // better backend optimization (memory coalescing, vectorization). Check if
+ // the source element type is a multi-dimensional array.
+ Type *GEPSrcElemTy = GEP.getSourceElementType();
+ bool IsMultiDimArray_Strip = GEPSrcElemTy->isArrayTy() &&
+ GEPSrcElemTy->getArrayElementType()->isArrayTy();
+
+ if (!IsMultiDimArray_Strip) {
+ unsigned NumNonZeroIndices = count_if(Indices, [](Value *Idx) {
+ auto *C = dyn_cast<Constant>(Idx);
+ return !C || !C->isNullValue();
+ });
+ if (NumNonZeroIndices > 1)
+ return nullptr;
+ }
return replaceInstUsesWith(
GEP, Builder.CreateGEP(
@@ -3364,17 +3373,24 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
drop_end(Indices), "", GEP.getNoWrapFlags()));
}
- // Strip leading zero indices.
- auto *FirstIdx = dyn_cast<Constant>(Indices.front());
- if (FirstIdx && FirstIdx->isNullValue() &&
- !FirstIdx->getType()->isVectorTy()) {
- gep_type_iterator GTI = gep_type_begin(GEP);
- ++GTI;
- if (!GTI.isStruct())
- return replaceInstUsesWith(GEP, Builder.CreateGEP(GTI.getIndexedType(),
- GEP.getPointerOperand(),
- drop_begin(Indices), "",
- GEP.getNoWrapFlags()));
+ // Strip leading zero indices (except for multi-dimensional arrays).
+ // Preserve structure for better backend optimization.
+ Type *GEPSrcElemTy = GEP.getSourceElementType();
+ bool IsMultiDimArray_Strip = GEPSrcElemTy->isArrayTy() &&
+ GEPSrcElemTy->getArrayElementType()->isArrayTy();
+
+ if (!IsMultiDimArray_Strip) {
+ auto *FirstIdx = dyn_cast<Constant>(Indices.front());
+ if (FirstIdx && FirstIdx->isNullValue() &&
+ !FirstIdx->getType()->isVectorTy()) {
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ ++GTI;
+ if (!GTI.isStruct())
+ return replaceInstUsesWith(GEP, Builder.CreateGEP(GTI.getIndexedType(),
+ GEP.getPointerOperand(),
+ drop_begin(Indices), "",
+ GEP.getNoWrapFlags()));
+ }
}
// Scalarize vector operands; prefer splat-of-gep.as canonical form.
@@ -3403,29 +3419,33 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return replaceInstUsesWith(GEP, Res);
}
- bool SeenNonZeroIndex = false;
- for (auto [IdxNum, Idx] : enumerate(Indices)) {
- auto *C = dyn_cast<Constant>(Idx);
- if (C && C->isNullValue())
- continue;
+ // GEP has multiple non-zero indices: Split it (except for multi-dim arrays).
+ // Preserve structure for better backend optimization.
+ if (!IsMultiDimArray_Strip) {
+ bool SeenNonZeroIndex = false;
+ for (auto [IdxNum, Idx] : enumerate(Indices)) {
+ auto *C = dyn_cast<Constant>(Idx);
+ if (C && C->isNullValue())
+ continue;
- if (!SeenNonZeroIndex) {
- SeenNonZeroIndex = true;
- continue;
- }
+ if (!SeenNonZeroIndex) {
+ SeenNonZeroIndex = true;
+ continue;
+ }
- // GEP has multiple non-zero indices: Split it.
- ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum);
- Value *FrontGEP =
- Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices,
- GEP.getName() + ".split", GEP.getNoWrapFlags());
-
- SmallVector<Value *> BackIndices;
- BackIndices.push_back(Constant::getNullValue(NewScalarIndexTy));
- append_range(BackIndices, drop_begin(Indices, IdxNum));
- return GetElementPtrInst::Create(
- GetElementPtrInst::getIndexedType(GEPEltType, FrontIndices), FrontGEP,
- BackIndices, GEP.getNoWrapFlags());
+ // GEP has multiple non-zero indices: Split it.
+ ArrayRef<Value *> FrontIndices = ArrayRef(Indices).take_front(IdxNum);
+ Value *FrontGEP =
+ Builder.CreateGEP(GEPEltType, PtrOp, FrontIndices,
+ GEP.getName() + ".split", GEP.getNoWrapFlags());
+
+ SmallVector<Value *> BackIndices;
+ BackIndices.push_back(Constant::getNullValue(NewScalarIndexTy));
+ append_range(BackIndices, drop_begin(Indices, IdxNum));
+ return GetElementPtrInst::Create(
+ GetElementPtrInst::getIndexedType(GEPEltType, FrontIndices), FrontGEP,
+ BackIndices, GEP.getNoWrapFlags());
+ }
}
// Check to see if the inputs to the PHI node are getelementptr instructions.
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-gep-constglob.ll b/llvm/test/Transforms/InstCombine/canonicalize-gep-constglob.ll
index 129da3f9110ad..6d238ae497d07 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-gep-constglob.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-gep-constglob.ll
@@ -35,9 +35,7 @@ define ptr @xzy(i64 %x, i64 %y, i64 %z) {
; CHECK-LABEL: define ptr @xzy(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[Z:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_SPLIT:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr getelementptr inbounds nuw (i8, ptr @glob, i64 40), i64 [[X]]
-; CHECK-NEXT: [[GEP_SPLIT1:%.*]] = getelementptr inbounds [10 x i32], ptr [[GEP_SPLIT]], i64 [[Z]]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[GEP_SPLIT1]], i64 [[Y]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr getelementptr inbounds nuw (i8, ptr @glob, i64 40), i64 0, i64 [[X]], i64 [[Z]], i64 [[Y]]
; CHECK-NEXT: ret ptr [[GEP]]
;
entry:
diff --git a/llvm/test/Transforms/InstCombine/strcmp-3.ll b/llvm/test/Transforms/InstCombine/strcmp-3.ll
index 72da736a0a9fd..2c4012b96e188 100644
--- a/llvm/test/Transforms/InstCombine/strcmp-3.ll
+++ b/llvm/test/Transforms/InstCombine/strcmp-3.ll
@@ -25,7 +25,7 @@ define i32 @fold_strcmp_a5i0_a5i1_to_0() {
define i32 @call_strcmp_a5i0_a5iI(i64 %I) {
; CHECK-LABEL: @call_strcmp_a5i0_a5iI(
-; CHECK-NEXT: [[Q:%.*]] = getelementptr [4 x i8], ptr @a5, i64 [[I:%.*]]
+; CHECK-NEXT: [[Q:%.*]] = getelementptr [5 x [4 x i8]], ptr @a5, i64 0, i64 [[I:%.*]]
; CHECK-NEXT: [[CMP:%.*]] = call i32 @strcmp(ptr noundef nonnull dereferenceable(4) @a5, ptr noundef nonnull dereferenceable(1) [[Q]])
; CHECK-NEXT: ret i32 [[CMP]]
;
@@ -40,7 +40,7 @@ define i32 @call_strcmp_a5i0_a5iI(i64 %I) {
define i32 @call_strcmp_a5iI_a5i0(i64 %I) {
; CHECK-LABEL: @call_strcmp_a5iI_a5i0(
-; CHECK-NEXT: [[P:%.*]] = getelementptr [4 x i8], ptr @a5, i64 [[I:%.*]]
+; CHECK-NEXT: [[P:%.*]] = getelementptr [5 x [4 x i8]], ptr @a5, i64 0, i64 [[I:%.*]]
; CHECK-NEXT: [[CMP:%.*]] = call i32 @strcmp(ptr noundef nonnull dereferenceable(1) [[P]], ptr noundef nonnull dereferenceable(4) @a5)
; CHECK-NEXT: ret i32 [[CMP]]
;
@@ -68,7 +68,7 @@ define i32 @fold_strcmp_a5i0_a5i1_p1_to_0() {
define i32 @call_strcmp_a5i0_a5i1_pI(i64 %I) {
; CHECK-LABEL: @call_strcmp_a5i0_a5i1_pI(
-; CHECK-NEXT: [[Q:%.*]] = getelementptr i8, ptr getelementptr inbounds nuw (i8, ptr @a5, i64 4), i64 [[I:%.*]]
+; CHECK-NEXT: [[Q:%.*]] = getelementptr [5 x [4 x i8]], ptr @a5, i64 0, i64 1, i64 [[I:%.*]]
; CHECK-NEXT: [[CMP:%.*]] = call i32 @strcmp(ptr noundef nonnull dereferenceable(4) @a5, ptr noundef nonnull dereferenceable(1) [[Q]])
; CHECK-NEXT: ret i32 [[CMP]]
;
diff --git a/llvm/test/Transforms/InstCombine/strlen-8.ll b/llvm/test/Transforms/InstCombine/strlen-8.ll
index af12198069803..b4334ddd8f1ac 100644
--- a/llvm/test/Transforms/InstCombine/strlen-8.ll
+++ b/llvm/test/Transforms/InstCombine/strlen-8.ll
@@ -16,7 +16,7 @@ declare i64 @strlen(ptr)
define i64 @fold_a5_4_i0_pI(i64 %I) {
; CHECK-LABEL: @fold_a5_4_i0_pI(
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr @a5_4, i64 [[I:%.*]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr [5 x [4 x i8]], ptr @a5_4, i64 0, i64 0, i64 [[I:%.*]]
; CHECK-NEXT: [[LEN:%.*]] = call i64 @strlen(ptr noundef nonnull dereferenceable(1) [[PTR]])
; CHECK-NEXT: ret i64 [[LEN]]
;
@@ -30,7 +30,7 @@ define i64 @fold_a5_4_i0_pI(i64 %I) {
define i64 @fold_a5_4_i1_pI(i64 %I) {
; CHECK-LABEL: @fold_a5_4_i1_pI(
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr getelementptr inbounds nuw (i8, ptr @a5_4, i64 4), i64 [[I:%.*]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr [5 x [4 x i8]], ptr @a5_4, i64 0, i64 1, i64 [[I:%.*]]
; CHECK-NEXT: [[LEN:%.*]] = call i64 @strlen(ptr noundef nonnull dereferenceable(1) [[PTR]])
; CHECK-NEXT: ret i64 [[LEN]]
;
@@ -44,7 +44,7 @@ define i64 @fold_a5_4_i1_pI(i64 %I) {
define i64 @fold_a5_4_i2_pI(i64 %I) {
; CHECK-LABEL: @fold_a5_4_i2_pI(
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr getelementptr inbounds nuw (i8, ptr @a5_4, i64 8), i64 [[I:%.*]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr [5 x [4 x i8]], ptr @a5_4, i64 0, i64 2, i64 [[I:%.*]]
; CHECK-NEXT: [[LEN:%.*]] = call i64 @strlen(ptr noundef nonnull dereferenceable(1) [[PTR]])
; CHECK-NEXT: ret i64 [[LEN]]
;
@@ -58,7 +58,7 @@ define i64 @fold_a5_4_i2_pI(i64 %I) {
define i64 @fold_a5_4_i3_pI_to_0(i64 %I) {
; CHECK-LABEL: @fold_a5_4_i3_pI_to_0(
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr getelementptr inbounds nuw (i8, ptr @a5_4, i64 12), i64 [[I:%.*]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr [5 x [4 x i8]], ptr @a5_4, i64 0, i64 3, i64 [[I:%.*]]
; CHECK-NEXT: [[LEN:%.*]] = call i64 @strlen(ptr noundef nonnull dereferenceable(1) [[PTR]])
; CHECK-NEXT: ret i64 [[LEN]]
;
@@ -72,7 +72,7 @@ define i64 @fold_a5_4_i3_pI_to_0(i64 %I) {
define i64 @fold_a5_4_i4_pI_to_0(i64 %I) {
; CHECK-LABEL: @fold_a5_4_i4_pI_to_0(
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr getelementptr inbounds nuw (i8, ptr @a5_4, i64 16), i64 [[I:%.*]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr [5 x [4 x i8]], ptr @a5_4, i64 0, i64 4, i64 [[I:%.*]]
; CHECK-NEXT: [[LEN:%.*]] = call i64 @strlen(ptr noundef nonnull dereferenceable(1) [[PTR]])
; CHECK-NEXT: ret i64 [[LEN]]
;
``````````
</details>
https://github.com/llvm/llvm-project/pull/176414
More information about the llvm-commits
mailing list