[llvm] [SeparateConstOffsetFromGEP] - Fix the simplification of GEP index when its operand is an XOR constant. (PR #175724)
Pawan Nirpal via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 29 06:26:58 PST 2026
https://github.com/pawan-nirpal-031 updated https://github.com/llvm/llvm-project/pull/175724
>From ce6347d4e088fdc2941e1a2e267cf04b75d6276b Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Tue, 16 Dec 2025 11:07:23 +0530
Subject: [PATCH 01/10] [AArch64] - Allow for aggressive unrolling, with
non-zero LoopMicroOpBufferSize for Oryon
---
llvm/lib/Target/AArch64/AArch64SchedOryon.td | 2 +-
.../aarch64-mcpu-oryon-runtime-unroll.ll | 152 ++++++++++++++++++
2 files changed, 153 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AArch64/aarch64-mcpu-oryon-runtime-unroll.ll
diff --git a/llvm/lib/Target/AArch64/AArch64SchedOryon.td b/llvm/lib/Target/AArch64/AArch64SchedOryon.td
index 5b597b91e7459..435eaf99c6175 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedOryon.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedOryon.td
@@ -19,7 +19,7 @@ def OryonModel : SchedMachineModel {
let MicroOpBufferSize = 376;
let LoadLatency = 4;
let MispredictPenalty = 13; // 13 cycles for mispredicted branch.
- let LoopMicroOpBufferSize = 0; // Do not have a LoopMicroOpBuffer
+ let LoopMicroOpBufferSize = 16; // Oryon-1 does not have a loop micro-op buffer; we enable this pseudo value to allow aggressive unrolling based on the runtime trip count.
let PostRAScheduler = 1; // Using PostRA sched.
let CompleteModel = 1;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mcpu-oryon-runtime-unroll.ll b/llvm/test/CodeGen/AArch64/aarch64-mcpu-oryon-runtime-unroll.ll
new file mode 100644
index 0000000000000..79136cf71c005
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-mcpu-oryon-runtime-unroll.ll
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes='loop-unroll' -S %s | FileCheck %s --check-prefix=UNROLLED
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @foo(ptr %mat, ptr %sharr, ptr %barr, i16 %rows, i16 %dimout) #0 {
+; UNROLLED-LABEL: define void @foo(
+; UNROLLED-SAME: ptr [[MAT:%.*]], ptr [[SHARR:%.*]], ptr [[BARR:%.*]], i16 [[ROWS:%.*]], i16 [[DIMOUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLLED-NEXT: [[ENTRY:.*:]]
+; UNROLLED-NEXT: [[CMP33:%.*]] = icmp sgt i16 [[DIMOUT]], 0
+; UNROLLED-NEXT: br i1 [[CMP33]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END22:.*]]
+; UNROLLED: [[FOR_BODY_LR_PH]]:
+; UNROLLED-NEXT: [[CMP631:%.*]] = icmp sgt i16 [[ROWS]], 0
+; UNROLLED-NEXT: br i1 [[CMP631]], label %[[FOR_BODY_US_PREHEADER:.*]], label %[[FOR_BODY_LR_PH_SPLIT:.*]]
+; UNROLLED: [[FOR_BODY_US_PREHEADER]]:
+; UNROLLED-NEXT: [[WIDE_TRIP_COUNT39:%.*]] = zext nneg i16 [[DIMOUT]] to i64
+; UNROLLED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i16 [[ROWS]] to i64
+; UNROLLED-NEXT: [[TMP0:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
+; UNROLLED-NEXT: br label %[[FOR_BODY_US:.*]]
+; UNROLLED: [[FOR_BODY_US]]:
+; UNROLLED-NEXT: [[INDVARS_IV36:%.*]] = phi i64 [ 0, %[[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT37:%.*]], %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US:.*]] ]
+; UNROLLED-NEXT: store i8 0, ptr [[BARR]], align 1
+; UNROLLED-NEXT: [[INVARIANT_GEP_US:%.*]] = getelementptr i8, ptr [[MAT]], i64 [[INDVARS_IV36]]
+; UNROLLED-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1
+; UNROLLED-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; UNROLLED-NEXT: br i1 [[TMP1]], label %[[FOR_BODY8_US_EPIL_PREHEADER:.*]], label %[[FOR_BODY_US_NEW:.*]]
+; UNROLLED: [[FOR_BODY_US_NEW]]:
+; UNROLLED-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[XTRAITER]]
+; UNROLLED-NEXT: br label %[[FOR_BODY8_US:.*]]
+; UNROLLED: [[FOR_BODY8_US]]:
+; UNROLLED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_US_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], %[[FOR_INC_US_1:.*]] ]
+; UNROLLED-NEXT: [[TMP2:%.*]] = phi i8 [ 0, %[[FOR_BODY_US_NEW]] ], [ [[TMP8:%.*]], %[[FOR_INC_US_1]] ]
+; UNROLLED-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_US_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[FOR_INC_US_1]] ]
+; UNROLLED-NEXT: [[GEP_US:%.*]] = getelementptr [2 x i8], ptr [[INVARIANT_GEP_US]], i64 [[INDVARS_IV]]
+; UNROLLED-NEXT: [[TMP3:%.*]] = load i8, ptr [[GEP_US]], align 1
+; UNROLLED-NEXT: [[TOBOOL_NOT_US:%.*]] = icmp eq i8 [[TMP3]], 0
+; UNROLLED-NEXT: br i1 [[TOBOOL_NOT_US]], label %[[FOR_INC_US:.*]], label %[[IF_THEN_US:.*]]
+; UNROLLED: [[IF_THEN_US]]:
+; UNROLLED-NEXT: [[ARRAYIDX14_US:%.*]] = getelementptr inbounds nuw i8, ptr [[SHARR]], i64 [[INDVARS_IV]]
+; UNROLLED-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX14_US]], align 1
+; UNROLLED-NEXT: [[XOR30_US:%.*]] = xor i8 [[TMP2]], [[TMP4]]
+; UNROLLED-NEXT: store i8 [[XOR30_US]], ptr [[BARR]], align 1
+; UNROLLED-NEXT: br label %[[FOR_INC_US]]
+; UNROLLED: [[FOR_INC_US]]:
+; UNROLLED-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP2]], %[[FOR_BODY8_US]] ], [ [[XOR30_US]], %[[IF_THEN_US]] ]
+; UNROLLED-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; UNROLLED-NEXT: [[GEP_US_1:%.*]] = getelementptr [2 x i8], ptr [[INVARIANT_GEP_US]], i64 [[INDVARS_IV_NEXT]]
+; UNROLLED-NEXT: [[TMP6:%.*]] = load i8, ptr [[GEP_US_1]], align 1
+; UNROLLED-NEXT: [[TOBOOL_NOT_US_1:%.*]] = icmp eq i8 [[TMP6]], 0
+; UNROLLED-NEXT: br i1 [[TOBOOL_NOT_US_1]], label %[[FOR_INC_US_1]], label %[[IF_THEN_US_1:.*]]
+; UNROLLED: [[IF_THEN_US_1]]:
+; UNROLLED-NEXT: [[ARRAYIDX14_US_1:%.*]] = getelementptr inbounds nuw i8, ptr [[SHARR]], i64 [[INDVARS_IV_NEXT]]
+; UNROLLED-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX14_US_1]], align 1
+; UNROLLED-NEXT: [[XOR30_US_1:%.*]] = xor i8 [[TMP5]], [[TMP7]]
+; UNROLLED-NEXT: store i8 [[XOR30_US_1]], ptr [[BARR]], align 1
+; UNROLLED-NEXT: br label %[[FOR_INC_US_1]]
+; UNROLLED: [[FOR_INC_US_1]]:
+; UNROLLED-NEXT: [[TMP8]] = phi i8 [ [[TMP5]], %[[FOR_INC_US]] ], [ [[XOR30_US_1]], %[[IF_THEN_US_1]] ]
+; UNROLLED-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; UNROLLED-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; UNROLLED-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; UNROLLED-NEXT: br i1 [[NITER_NCMP_1]], label %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US_UNR_LCSSA:.*]], label %[[FOR_BODY8_US]]
+; UNROLLED: [[FOR_COND3_FOR_INC20_CRIT_EDGE_US_UNR_LCSSA]]:
+; UNROLLED-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1]], %[[FOR_INC_US_1]] ]
+; UNROLLED-NEXT: [[DOTUNR:%.*]] = phi i8 [ [[TMP8]], %[[FOR_INC_US_1]] ]
+; UNROLLED-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLLED-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY8_US_EPIL_PREHEADER]], label %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US]]
+; UNROLLED: [[FOR_BODY8_US_EPIL_PREHEADER]]:
+; UNROLLED-NEXT: [[INDVARS_IV_EPIL_INIT:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[INDVARS_IV_UNR]], %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US_UNR_LCSSA]] ]
+; UNROLLED-NEXT: [[DOTEPIL_INIT:%.*]] = phi i8 [ 0, %[[FOR_BODY_US]] ], [ [[DOTUNR]], %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US_UNR_LCSSA]] ]
+; UNROLLED-NEXT: [[LCMP_MOD1:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLLED-NEXT: call void @llvm.assume(i1 [[LCMP_MOD1]])
+; UNROLLED-NEXT: br label %[[FOR_BODY8_US_EPIL:.*]]
+; UNROLLED: [[FOR_BODY8_US_EPIL]]:
+; UNROLLED-NEXT: [[GEP_US_EPIL:%.*]] = getelementptr [2 x i8], ptr [[INVARIANT_GEP_US]], i64 [[INDVARS_IV_EPIL_INIT]]
+; UNROLLED-NEXT: [[TMP9:%.*]] = load i8, ptr [[GEP_US_EPIL]], align 1
+; UNROLLED-NEXT: [[TOBOOL_NOT_US_EPIL:%.*]] = icmp eq i8 [[TMP9]], 0
+; UNROLLED-NEXT: br i1 [[TOBOOL_NOT_US_EPIL]], label %[[FOR_INC_US_EPIL:.*]], label %[[IF_THEN_US_EPIL:.*]]
+; UNROLLED: [[IF_THEN_US_EPIL]]:
+; UNROLLED-NEXT: [[ARRAYIDX14_US_EPIL:%.*]] = getelementptr inbounds nuw i8, ptr [[SHARR]], i64 [[INDVARS_IV_EPIL_INIT]]
+; UNROLLED-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX14_US_EPIL]], align 1
+; UNROLLED-NEXT: [[XOR30_US_EPIL:%.*]] = xor i8 [[DOTEPIL_INIT]], [[TMP10]]
+; UNROLLED-NEXT: store i8 [[XOR30_US_EPIL]], ptr [[BARR]], align 1
+; UNROLLED-NEXT: br label %[[FOR_INC_US_EPIL]]
+; UNROLLED: [[FOR_INC_US_EPIL]]:
+; UNROLLED-NEXT: br label %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US]]
+; UNROLLED: [[FOR_COND3_FOR_INC20_CRIT_EDGE_US]]:
+; UNROLLED-NEXT: [[INDVARS_IV_NEXT37]] = add nuw nsw i64 [[INDVARS_IV36]], 1
+; UNROLLED-NEXT: [[EXITCOND40_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT37]], [[WIDE_TRIP_COUNT39]]
+; UNROLLED-NEXT: br i1 [[EXITCOND40_NOT]], label %[[FOR_END22_LOOPEXIT:.*]], label %[[FOR_BODY_US]]
+; UNROLLED: [[FOR_BODY_LR_PH_SPLIT]]:
+; UNROLLED-NEXT: store i8 0, ptr [[BARR]], align 1
+; UNROLLED-NEXT: br label %[[FOR_END22]]
+; UNROLLED: [[FOR_END22_LOOPEXIT]]:
+; UNROLLED-NEXT: br label %[[FOR_END22]]
+; UNROLLED: [[FOR_END22]]:
+; UNROLLED-NEXT: ret void
+;
+entry:
+ %cmp33 = icmp sgt i16 %dimout, 0
+ br i1 %cmp33, label %for.body.lr.ph, label %for.end22
+
+for.body.lr.ph: ; preds = %entry
+ %cmp631 = icmp sgt i16 %rows, 0
+ br i1 %cmp631, label %for.body.us.preheader, label %for.body.lr.ph.split
+
+for.body.us.preheader: ; preds = %for.body.lr.ph
+ %wide.trip.count39 = zext nneg i16 %dimout to i64
+ %wide.trip.count = zext nneg i16 %rows to i64
+ br label %for.body.us
+
+for.body.us: ; preds = %for.body.us.preheader, %for.cond3.for.inc20_crit_edge.us
+ %indvars.iv36 = phi i64 [ 0, %for.body.us.preheader ], [ %indvars.iv.next37, %for.cond3.for.inc20_crit_edge.us ]
+ store i8 0, ptr %barr, align 1
+ %invariant.gep.us = getelementptr i8, ptr %mat, i64 %indvars.iv36
+ br label %for.body8.us
+
+for.body8.us: ; preds = %for.body.us, %for.inc.us
+ %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.inc.us ]
+ %0 = phi i8 [ 0, %for.body.us ], [ %3, %for.inc.us ]
+ %gep.us = getelementptr [2 x i8], ptr %invariant.gep.us, i64 %indvars.iv
+ %1 = load i8, ptr %gep.us, align 1
+ %tobool.not.us = icmp eq i8 %1, 0
+ br i1 %tobool.not.us, label %for.inc.us, label %if.then.us
+
+if.then.us: ; preds = %for.body8.us
+ %arrayidx14.us = getelementptr inbounds nuw i8, ptr %sharr, i64 %indvars.iv
+ %2 = load i8, ptr %arrayidx14.us, align 1
+ %xor30.us = xor i8 %0, %2
+ store i8 %xor30.us, ptr %barr, align 1
+ br label %for.inc.us
+
+for.inc.us: ; preds = %if.then.us, %for.body8.us
+ %3 = phi i8 [ %0, %for.body8.us ], [ %xor30.us, %if.then.us ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond3.for.inc20_crit_edge.us, label %for.body8.us
+
+for.cond3.for.inc20_crit_edge.us: ; preds = %for.inc.us
+ %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, 1
+ %exitcond40.not = icmp eq i64 %indvars.iv.next37, %wide.trip.count39
+ br i1 %exitcond40.not, label %for.end22, label %for.body.us
+
+for.body.lr.ph.split: ; preds = %for.body.lr.ph
+ store i8 0, ptr %barr, align 1
+ br label %for.end22
+
+for.end22: ; preds = %for.cond3.for.inc20_crit_edge.us, %for.body.lr.ph.split, %entry
+ ret void
+}
+
+attributes #0 = { "target-cpu"="oryon-1" "target-features"="+neon,+sve" }
>From 905a2847d0e80377dc8a5a5813057a148d7c04a6 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Tue, 13 Jan 2026 14:15:36 +0530
Subject: [PATCH 02/10] [SeparateConstOffsetFromGEP] Restrict simplification of
XOR index when its base operand is constant
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 7 ++++++-
.../fix-const-xor-gep-simplify.ll | 17 +++++++++++++++++
2 files changed, 23 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index dc47b243625b8..d1ea825e33409 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -796,7 +796,12 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
ConstantInt *XorConstant;
// Match pattern: xor BaseOperand, Constant.
- if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
+ // Restrict BaseOperand to a compile-time unknown value.
+ // TODO: Formally prove that it is safe for BaseOperand to be a compile-time
+ // constant, and extend the optimization to those cases.
+ if (!match(XorInst,
+ m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))) ||
+ (isa<Constant>(BaseOperand)))
return APInt::getZero(BitWidth);
// Compute known bits for the base operand.
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
new file mode 100644
index 0000000000000..042d0aa03ed8a
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
@@ -0,0 +1,17 @@
+; RUN: opt -S -passes='separate-const-offset-from-gep' %s | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+define ptr @src(i32 %0) {
+; CHECK-LABEL: @src(
+; CHECK-NEXT: %base = alloca [4 x i32], align 16
+; CHECK-NEXT: %2 = xor i64 0, 3
+; CHECK-NEXT: %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %2
+; CHECK-NEXT: ret ptr %gep
+
+; CHECK-NOT: getelementptr i8, ptr %gep, i64 12
+ %base = alloca [4 x i32], align 16
+ %2 = xor i64 0, 3
+ %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %2
+ ret ptr %gep
+}
>From d43e9937f6b6cc8b7fca9f6cff2cfc578a55569d Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pawannirpal at gmail.com>
Date: Tue, 13 Jan 2026 14:29:41 +0530
Subject: [PATCH 03/10] Apply suggestion from @arsenm
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
.../SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
index 042d0aa03ed8a..e923072174bea 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -passes='separate-const-offset-from-gep' %s | FileCheck %s
+; RUN: opt -S -passes=separate-const-offset-from-gep %s | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-linux-gnu"
>From c5f2b69261afe193b415e2a7b007aac918da700c Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pawannirpal at gmail.com>
Date: Tue, 13 Jan 2026 14:30:12 +0530
Subject: [PATCH 04/10] Apply suggestion from @arsenm
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
.../SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
index e923072174bea..ce1174b700d24 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
@@ -1,6 +1,4 @@
; RUN: opt -S -passes=separate-const-offset-from-gep %s | FileCheck %s
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
-target triple = "aarch64-unknown-linux-gnu"
define ptr @src(i32 %0) {
; CHECK-LABEL: @src(
>From caebb2e14495b8fd8e78d0facfa4640d25e0e460 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Tue, 13 Jan 2026 14:41:47 +0530
Subject: [PATCH 05/10] address comments 1
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 2 +-
.../fix-const-xor-gep-simplify.ll | 19 ++++++++++---------
2 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index d1ea825e33409..2a1385c26763c 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -801,7 +801,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
// and extend the optimization for those cases.
if (!match(XorInst,
m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))) ||
- (isa<Constant>(BaseOperand)))
+ (match(BaseOperand, m_ConstantInt())))
return APInt::getZero(BitWidth);
// Compute known bits for the base operand.
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
index ce1174b700d24..50b79aac72c5f 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
@@ -1,15 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S -passes=separate-const-offset-from-gep %s | FileCheck %s
define ptr @src(i32 %0) {
-; CHECK-LABEL: @src(
-; CHECK-NEXT: %base = alloca [4 x i32], align 16
-; CHECK-NEXT: %2 = xor i64 0, 3
-; CHECK-NEXT: %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %2
-; CHECK-NEXT: ret ptr %gep
-
-; CHECK-NOT: getelementptr i8, ptr %gep, i64 12
+; CHECK-LABEL: define ptr @src(
+; CHECK-SAME: i32 [[TMP0:%.*]]) {
+; CHECK-NEXT: [[BASE:%.*]] = alloca [4 x i32], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, 3
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x i32], ptr [[BASE]], i64 0, i64 [[TMP2]]
+; CHECK-NEXT: ret ptr [[GEP]]
+;
%base = alloca [4 x i32], align 16
- %2 = xor i64 0, 3
- %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %2
+ %xor = xor i64 0, 3
+ %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %xor
ret ptr %gep
}
>From 57e3c81cd8e63bc6ebac9be6647e477375941844 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Tue, 13 Jan 2026 21:51:54 +0530
Subject: [PATCH 06/10] refactor logic
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 17 ++++++-----
.../fix-const-xor-gep-simplify.ll | 30 +++++++++++++++----
2 files changed, 34 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 2a1385c26763c..10cc056eec0cc 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -729,8 +729,16 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
// with non-disjoint constant bits.
// TODO: The design should be updated to support partial constant
// extraction.
- if (BO->getOpcode() == Instruction::Xor)
+ if (BO->getOpcode() == Instruction::Xor) {
+ if (auto *ConstIOther = dyn_cast<ConstantInt>(TheOther)) {
+ const APInt &DisjointBits = extractDisjointBitsFromXor(BO);
+ const APInt &ConstantValue = ConstIOther->getValue();
+ const APInt &NonDisjointBits = ConstantValue & (~DisjointBits);
+ BO->setOperand(1 - OpNo,
+ ConstantInt::get(BO->getType(), NonDisjointBits));
+ }
return BO;
+ }
// If NextInChain is 0 and not the LHS of a sub, we can simplify the
// sub-expression to be just TheOther.
@@ -796,12 +804,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
ConstantInt *XorConstant;
// Match pattern: xor BaseOperand, Constant.
- // Restricting the BaseOperand to a compile time unknown.
- // TODO : Formally prove the safety of BaseOperand to be compile time constant
- // and extend the optimization for those cases.
- if (!match(XorInst,
- m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))) ||
- (match(BaseOperand, m_ConstantInt())))
+ if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
return APInt::getZero(BitWidth);
// Compute known bits for the base operand.
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
index 50b79aac72c5f..049463b33ac71 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
@@ -1,16 +1,34 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -passes=separate-const-offset-from-gep %s | FileCheck %s
+; RUN: opt -S -passes='separate-const-offset-from-gep<lower-gep>' %s | FileCheck %s
-define ptr @src(i32 %0) {
-; CHECK-LABEL: define ptr @src(
+define ptr @xor-const-test(i32 %0) {
+; CHECK-LABEL: define ptr @xor-const-test(
; CHECK-SAME: i32 [[TMP0:%.*]]) {
; CHECK-NEXT: [[BASE:%.*]] = alloca [4 x i32], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, 3
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x i32], ptr [[BASE]], i64 0, i64 [[TMP2]]
-; CHECK-NEXT: ret ptr [[GEP]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i64 0, 0
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[XOR1]], 2
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i64 12
+; CHECK-NEXT: ret ptr [[TMP2]]
;
%base = alloca [4 x i32], align 16
%xor = xor i64 0, 3
%gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %xor
ret ptr %gep
}
+
+define ptr @xor-const-test2(i32 %0) {
+; CHECK-LABEL: define ptr @xor-const-test2(
+; CHECK-SAME: i32 [[TMP0:%.*]]) {
+; CHECK-NEXT: [[BASE:%.*]] = alloca [4 x i32], align 16
+; CHECK-NEXT: [[XOR1:%.*]] = xor i64 2, 2
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[XOR1]], 2
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP2]]
+; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i64 4
+; CHECK-NEXT: ret ptr [[UGLYGEP2]]
+;
+ %base = alloca [4 x i32], align 16
+ %xor = xor i64 2, 3
+ %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %xor
+ ret ptr %gep
+}
>From 5964193743ed792a02bace52d14123aa730e2a96 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Wed, 21 Jan 2026 13:26:23 +0530
Subject: [PATCH 07/10] Add explanatory comments
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 43 ++++++++++++++++++-
1 file changed, 41 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 10cc056eec0cc..172f47fa852f7 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -725,12 +725,51 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
if (CI->isZero()) {
- // Custom XOR handling for disjoint bits - preserves original XOR
- // with non-disjoint constant bits.
+ // Custom XOR handling for disjoint bits.
// TODO: The design should be updated to support partial constant
// extraction.
if (BO->getOpcode() == Instruction::Xor) {
if (auto *ConstIOther = dyn_cast<ConstantInt>(TheOther)) {
+ // Purpose:
+ // We want to compute/simplify the expression:
+ // b + scale * (a ^ c)
+ // Here a and c are both constants.
+
+ // Transform:
+ // We must partition c into disjoint and non-disjoint components and
+ // only XOR the non-disjoint bits with a:
+
+ // non_disjoint(c) = c & ~disjoint(c)
+
+ // Therefore the correct form is:
+ // b + ((a ^ non_disjoint(c)) + disjoint(c)) * scale
+
+ // Rationale:
+ // - Bits of c that are disjoint from a (i.e., where a is known
+ // zero)
+ // pass through unchanged (added, not XORed).
+ // - Only the overlapping (non-disjoint) bits of c should
+ // participate
+ // in the XOR with a.
+
+ // Example:
+ // a = 0
+ // c = 3
+ // scale = 4
+
+ // Expected:
+ // b + scale * (a ^ c)
+ // = b + 4 * (0 ^ 3)
+ // = b + 4 * 3
+ // = b + 12
+
+ // Transform:
+ // non_disjoint(3) = 3 & ~3 = 0
+ // b + ((0 ^ non_disjoint(3)) + disjoint(3)) * 4
+ // = b + ((0 ^ 0) + 3) * 4
+ // = b + 3 * 4
+ // = b + 12
+
const APInt &DisjointBits = extractDisjointBitsFromXor(BO);
const APInt &ConstantValue = ConstIOther->getValue();
const APInt &NonDisjointBits = ConstantValue & (~DisjointBits);
>From 60cdac52f7143ff0f14b06fe2004dc92568f20ca Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Wed, 21 Jan 2026 13:29:55 +0530
Subject: [PATCH 08/10] comment
---
llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 172f47fa852f7..d5a951145b949 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -744,6 +744,9 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
// Therefore the correct form is:
// b + ((a ^ non_disjoint(c)) + disjoint(c)) * scale
+ // And not b + ((a ^ c) + disjoint(c)) * scale, which ignores the
+ // non_disjoint split of the constant c and is therefore incorrect.
+
// Rationale:
// - Bits of c that are disjoint from a (i.e., where a is known
// zero)
>From c06cf41488b650a29e7433b406b49ee798d61b2f Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Thu, 29 Jan 2026 19:46:07 +0530
Subject: [PATCH 09/10] Address comments
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 137 +++++++++---------
1 file changed, 66 insertions(+), 71 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index d5a951145b949..c91d3bf2abecf 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -297,7 +297,7 @@ class ConstantOffsetExtractor {
/// Analyze XOR instruction to extract disjoint constant bits that behave
/// like addition operations for improved address mode folding.
- APInt extractDisjointBitsFromXor(BinaryOperator *XorInst);
+ std::pair<APInt, APInt> extractDisjointBitsFromXor(BinaryOperator *XorInst);
/// The path from the constant offset to the old GEP index. e.g., if the GEP
/// index is "a * b + (c + 5)". After running function find, UserChain[0] will
@@ -602,8 +602,10 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
// Handle XOR with disjoint bits that can be treated as addition.
- else if (BO->getOpcode() == Instruction::Xor)
- ConstantOffset = extractDisjointBitsFromXor(BO);
+ else if (BO->getOpcode() == Instruction::Xor) {
+ auto [DisjointBits, NonDisjointBits] = extractDisjointBitsFromXor(BO);
+ ConstantOffset = DisjointBits;
+ }
} else if (isa<TruncInst>(V)) {
ConstantOffset =
find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
@@ -723,72 +725,65 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
Value *NextInChain = removeConstOffset(ChainIndex - 1);
Value *TheOther = BO->getOperand(1 - OpNo);
- if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
- if (CI->isZero()) {
- // Custom XOR handling for disjoint bits.
- // TODO: The design should be updated to support partial constant
- // extraction.
- if (BO->getOpcode() == Instruction::Xor) {
- if (auto *ConstIOther = dyn_cast<ConstantInt>(TheOther)) {
- // Purpose:
- // We want to compute/simplify the expression:
- // b + scale * (a ^ c)
- // Here a and c are both constants.
-
- // Transform:
- // We must partition c into disjoint and non-disjoint components and
- // only XOR the non-disjoint bits with a:
-
- // non_disjoint(c) = c & ~disjoint(c)
-
- // Therefore the correct form is:
- // b + ((a ^ non_disjoint(c)) + disjoint(c)) * scale
-
- // And not b + ((a ^ (c)) + disjoint(c)) * scale, which ignores the
- // non_disjoint split of the constant c which is incorrect.
-
- // Rationale:
- // - Bits of c that are disjoint from a (i.e., where a is known
- // zero)
- // pass through unchanged (added, not XORed).
- // - Only the overlapping (non-disjoint) bits of c should
- // participate
- // in the XOR with a.
-
- // Example:
- // a = 0
- // c = 3
- // scale = 4
-
- // Expected:
- // b + scale * (a ^ c)
- // = b + 4 * (0 ^ 3)
- // = b + 4 * 3
- // = b + 12
-
- // Transform:
- // non_disjoint(3) = 3 & ~3 = 0
- // b + ((0 ^ non_disjoint(3)) + disjoint(3)) * 4
- // = b + ((0 ^ 0) + 3) * 4
- // = b + 3 * 4
- // = b + 12
-
- const APInt &DisjointBits = extractDisjointBitsFromXor(BO);
- const APInt &ConstantValue = ConstIOther->getValue();
- const APInt &NonDisjointBits = ConstantValue & (~DisjointBits);
- BO->setOperand(1 - OpNo,
- ConstantInt::get(BO->getType(), NonDisjointBits));
- }
- return BO;
- }
-
- // If NextInChain is 0 and not the LHS of a sub, we can simplify the
- // sub-expression to be just TheOther.
- if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
- return TheOther;
+ // Custom XOR handling for disjoint bits.
+ // TODO: The design should be updated to support partial constant
+ // extraction.
+ if (BO->getOpcode() == Instruction::Xor) {
+ if (dyn_cast<ConstantInt>(TheOther)) {
+ // Purpose:
+ // We want to compute/simplify the expression:
+ // b + scale * (a ^ c)
+ // Here a and c are both constants.
+
+ // Transform:
+ // We must partition c into disjoint and non-disjoint components and
+ // only XOR the non-disjoint bits with a:
+
+ // non_disjoint(c) = c & ~disjoint(c)
+
+ // Therefore the correct form is:
+ // b + ((a ^ non_disjoint(c)) + disjoint(c)) * scale
+
+ // And not b + ((a ^ (c)) + disjoint(c)) * scale, which ignores the
+ // non_disjoint split of the constant c which is incorrect.
+
+ // Rationale:
+ // - Bits of c that are disjoint from a (i.e., where a is known
+ // zero)
+ // pass through unchanged (added, not XORed).
+ // - Only the overlapping (non-disjoint) bits of c should
+ // participate in the XOR with a.
+
+ // Example:
+ // a = 0
+ // c = 3
+ // scale = 4
+
+ // Expected:
+ // b + scale * (a ^ c)
+ // = b + 4 * (0 ^ 3)
+ // = b + 4 * 3
+ // = b + 12
+
+ // Transform:
+ // non_disjoint(3) = 3 & ~3 = 0
+ // b + ((0 ^ non_disjoint(3)) + disjoint(3)) * 4
+ // = b + ((0 ^ 0) + 3) * 4
+ // = b + 3 * 4
+ // = b + 12
+
+ auto [DisjointBits, NonDisjointBits] = extractDisjointBitsFromXor(BO);
+ BO->setOperand(1 - OpNo,
+ ConstantInt::get(BO->getType(), NonDisjointBits));
}
+ return BO;
}
+ // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+ // sub-expression to be just TheOther.
+ if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
+
BinaryOperator::BinaryOps NewOp = BO->getOpcode();
if (BO->getOpcode() == Instruction::Or) {
// Rebuild "or" as "add", because "or" may be invalid for the new
@@ -836,8 +831,8 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
/// \param XorInst The XOR binary operator to analyze
/// \return APInt containing the disjoint bits that can be extracted as offset,
/// or zero if no disjoint bits exist
-APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
- BinaryOperator *XorInst) {
+std::pair<APInt, APInt>
+ConstantOffsetExtractor::extractDisjointBitsFromXor(BinaryOperator *XorInst) {
assert(XorInst && XorInst->getOpcode() == Instruction::Xor &&
"Expected XOR instruction");
@@ -847,7 +842,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
// Match pattern: xor BaseOperand, Constant.
if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
- return APInt::getZero(BitWidth);
+ return {APInt::getZero(BitWidth), APInt::getZero(BitWidth)};
// Compute known bits for the base operand.
const SimplifyQuery SQ(DL);
@@ -859,7 +854,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
// Early exit if no disjoint bits found.
if (DisjointBits.isZero())
- return APInt::getZero(BitWidth);
+ return {APInt::getZero(BitWidth), APInt::getZero(BitWidth)};
// Compute the remaining non-disjoint bits that stay in the XOR.
const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
@@ -875,7 +870,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
// This will replace the original constant in the XOR with the new
// constant.
UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits));
- return DisjointBits;
+ return {DisjointBits, NonDisjointBits};
}
/// A helper function to check if reassociating through an entry in the user
>From a3430f7bcd51cbf79e5f8295b578d1676a34437e Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Thu, 29 Jan 2026 19:56:38 +0530
Subject: [PATCH 10/10] keep the nextinchain zero for sub simplify
---
llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index c91d3bf2abecf..c84e583362b9b 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -781,8 +781,10 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
// If NextInChain is 0 and not the LHS of a sub, we can simplify the
// sub-expression to be just TheOther.
- if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
- return TheOther;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain))
+ if (CI->isZero())
+ if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
BinaryOperator::BinaryOps NewOp = BO->getOpcode();
if (BO->getOpcode() == Instruction::Or) {
More information about the llvm-commits
mailing list