[llvm] [SeparateConstOffsetFromGEP] - Fix the simplification of GEP index when its operand is an XOR constant. (PR #175724)
Pawan Nirpal via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 29 06:26:58 PST 2026
https://github.com/pawan-nirpal-031 updated https://github.com/llvm/llvm-project/pull/175724
>From ce6347d4e088fdc2941e1a2e267cf04b75d6276b Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Tue, 16 Dec 2025 11:07:23 +0530
Subject: [PATCH 01/10] [AArch64] - Allow for aggressive unrolling, with
non-zero LoopMicroOpBufferSize for Oryon
---
llvm/lib/Target/AArch64/AArch64SchedOryon.td | 2 +-
.../aarch64-mcpu-oryon-runtime-unroll.ll | 152 ++++++++++++++++++
2 files changed, 153 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AArch64/aarch64-mcpu-oryon-runtime-unroll.ll
diff --git a/llvm/lib/Target/AArch64/AArch64SchedOryon.td b/llvm/lib/Target/AArch64/AArch64SchedOryon.td
index 5b597b91e7459..435eaf99c6175 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedOryon.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedOryon.td
@@ -19,7 +19,7 @@ def OryonModel : SchedMachineModel {
let MicroOpBufferSize = 376;
let LoadLatency = 4;
let MispredictPenalty = 13; // 13 cycles for mispredicted branch.
- let LoopMicroOpBufferSize = 0; // Do not have a LoopMicroOpBuffer
+ let LoopMicroOpBufferSize = 16; // Oryon-1 does not have a loop micro-op buffer; we enable this pseudo value to allow aggressive unrolling based on the runtime trip count.
let PostRAScheduler = 1; // Using PostRA sched.
let CompleteModel = 1;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mcpu-oryon-runtime-unroll.ll b/llvm/test/CodeGen/AArch64/aarch64-mcpu-oryon-runtime-unroll.ll
new file mode 100644
index 0000000000000..79136cf71c005
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-mcpu-oryon-runtime-unroll.ll
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes='loop-unroll' -S %s | FileCheck %s --check-prefix=UNROLLED
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @foo(ptr %mat, ptr %sharr, ptr %barr, i16 %rows, i16 %dimout) #0 {
+; UNROLLED-LABEL: define void @foo(
+; UNROLLED-SAME: ptr [[MAT:%.*]], ptr [[SHARR:%.*]], ptr [[BARR:%.*]], i16 [[ROWS:%.*]], i16 [[DIMOUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLLED-NEXT: [[ENTRY:.*:]]
+; UNROLLED-NEXT: [[CMP33:%.*]] = icmp sgt i16 [[DIMOUT]], 0
+; UNROLLED-NEXT: br i1 [[CMP33]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END22:.*]]
+; UNROLLED: [[FOR_BODY_LR_PH]]:
+; UNROLLED-NEXT: [[CMP631:%.*]] = icmp sgt i16 [[ROWS]], 0
+; UNROLLED-NEXT: br i1 [[CMP631]], label %[[FOR_BODY_US_PREHEADER:.*]], label %[[FOR_BODY_LR_PH_SPLIT:.*]]
+; UNROLLED: [[FOR_BODY_US_PREHEADER]]:
+; UNROLLED-NEXT: [[WIDE_TRIP_COUNT39:%.*]] = zext nneg i16 [[DIMOUT]] to i64
+; UNROLLED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i16 [[ROWS]] to i64
+; UNROLLED-NEXT: [[TMP0:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
+; UNROLLED-NEXT: br label %[[FOR_BODY_US:.*]]
+; UNROLLED: [[FOR_BODY_US]]:
+; UNROLLED-NEXT: [[INDVARS_IV36:%.*]] = phi i64 [ 0, %[[FOR_BODY_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT37:%.*]], %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US:.*]] ]
+; UNROLLED-NEXT: store i8 0, ptr [[BARR]], align 1
+; UNROLLED-NEXT: [[INVARIANT_GEP_US:%.*]] = getelementptr i8, ptr [[MAT]], i64 [[INDVARS_IV36]]
+; UNROLLED-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1
+; UNROLLED-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; UNROLLED-NEXT: br i1 [[TMP1]], label %[[FOR_BODY8_US_EPIL_PREHEADER:.*]], label %[[FOR_BODY_US_NEW:.*]]
+; UNROLLED: [[FOR_BODY_US_NEW]]:
+; UNROLLED-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[XTRAITER]]
+; UNROLLED-NEXT: br label %[[FOR_BODY8_US:.*]]
+; UNROLLED: [[FOR_BODY8_US]]:
+; UNROLLED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_US_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], %[[FOR_INC_US_1:.*]] ]
+; UNROLLED-NEXT: [[TMP2:%.*]] = phi i8 [ 0, %[[FOR_BODY_US_NEW]] ], [ [[TMP8:%.*]], %[[FOR_INC_US_1]] ]
+; UNROLLED-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_US_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[FOR_INC_US_1]] ]
+; UNROLLED-NEXT: [[GEP_US:%.*]] = getelementptr [2 x i8], ptr [[INVARIANT_GEP_US]], i64 [[INDVARS_IV]]
+; UNROLLED-NEXT: [[TMP3:%.*]] = load i8, ptr [[GEP_US]], align 1
+; UNROLLED-NEXT: [[TOBOOL_NOT_US:%.*]] = icmp eq i8 [[TMP3]], 0
+; UNROLLED-NEXT: br i1 [[TOBOOL_NOT_US]], label %[[FOR_INC_US:.*]], label %[[IF_THEN_US:.*]]
+; UNROLLED: [[IF_THEN_US]]:
+; UNROLLED-NEXT: [[ARRAYIDX14_US:%.*]] = getelementptr inbounds nuw i8, ptr [[SHARR]], i64 [[INDVARS_IV]]
+; UNROLLED-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX14_US]], align 1
+; UNROLLED-NEXT: [[XOR30_US:%.*]] = xor i8 [[TMP2]], [[TMP4]]
+; UNROLLED-NEXT: store i8 [[XOR30_US]], ptr [[BARR]], align 1
+; UNROLLED-NEXT: br label %[[FOR_INC_US]]
+; UNROLLED: [[FOR_INC_US]]:
+; UNROLLED-NEXT: [[TMP5:%.*]] = phi i8 [ [[TMP2]], %[[FOR_BODY8_US]] ], [ [[XOR30_US]], %[[IF_THEN_US]] ]
+; UNROLLED-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; UNROLLED-NEXT: [[GEP_US_1:%.*]] = getelementptr [2 x i8], ptr [[INVARIANT_GEP_US]], i64 [[INDVARS_IV_NEXT]]
+; UNROLLED-NEXT: [[TMP6:%.*]] = load i8, ptr [[GEP_US_1]], align 1
+; UNROLLED-NEXT: [[TOBOOL_NOT_US_1:%.*]] = icmp eq i8 [[TMP6]], 0
+; UNROLLED-NEXT: br i1 [[TOBOOL_NOT_US_1]], label %[[FOR_INC_US_1]], label %[[IF_THEN_US_1:.*]]
+; UNROLLED: [[IF_THEN_US_1]]:
+; UNROLLED-NEXT: [[ARRAYIDX14_US_1:%.*]] = getelementptr inbounds nuw i8, ptr [[SHARR]], i64 [[INDVARS_IV_NEXT]]
+; UNROLLED-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX14_US_1]], align 1
+; UNROLLED-NEXT: [[XOR30_US_1:%.*]] = xor i8 [[TMP5]], [[TMP7]]
+; UNROLLED-NEXT: store i8 [[XOR30_US_1]], ptr [[BARR]], align 1
+; UNROLLED-NEXT: br label %[[FOR_INC_US_1]]
+; UNROLLED: [[FOR_INC_US_1]]:
+; UNROLLED-NEXT: [[TMP8]] = phi i8 [ [[TMP5]], %[[FOR_INC_US]] ], [ [[XOR30_US_1]], %[[IF_THEN_US_1]] ]
+; UNROLLED-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; UNROLLED-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; UNROLLED-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; UNROLLED-NEXT: br i1 [[NITER_NCMP_1]], label %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US_UNR_LCSSA:.*]], label %[[FOR_BODY8_US]]
+; UNROLLED: [[FOR_COND3_FOR_INC20_CRIT_EDGE_US_UNR_LCSSA]]:
+; UNROLLED-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1]], %[[FOR_INC_US_1]] ]
+; UNROLLED-NEXT: [[DOTUNR:%.*]] = phi i8 [ [[TMP8]], %[[FOR_INC_US_1]] ]
+; UNROLLED-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLLED-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY8_US_EPIL_PREHEADER]], label %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US]]
+; UNROLLED: [[FOR_BODY8_US_EPIL_PREHEADER]]:
+; UNROLLED-NEXT: [[INDVARS_IV_EPIL_INIT:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[INDVARS_IV_UNR]], %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US_UNR_LCSSA]] ]
+; UNROLLED-NEXT: [[DOTEPIL_INIT:%.*]] = phi i8 [ 0, %[[FOR_BODY_US]] ], [ [[DOTUNR]], %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US_UNR_LCSSA]] ]
+; UNROLLED-NEXT: [[LCMP_MOD1:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLLED-NEXT: call void @llvm.assume(i1 [[LCMP_MOD1]])
+; UNROLLED-NEXT: br label %[[FOR_BODY8_US_EPIL:.*]]
+; UNROLLED: [[FOR_BODY8_US_EPIL]]:
+; UNROLLED-NEXT: [[GEP_US_EPIL:%.*]] = getelementptr [2 x i8], ptr [[INVARIANT_GEP_US]], i64 [[INDVARS_IV_EPIL_INIT]]
+; UNROLLED-NEXT: [[TMP9:%.*]] = load i8, ptr [[GEP_US_EPIL]], align 1
+; UNROLLED-NEXT: [[TOBOOL_NOT_US_EPIL:%.*]] = icmp eq i8 [[TMP9]], 0
+; UNROLLED-NEXT: br i1 [[TOBOOL_NOT_US_EPIL]], label %[[FOR_INC_US_EPIL:.*]], label %[[IF_THEN_US_EPIL:.*]]
+; UNROLLED: [[IF_THEN_US_EPIL]]:
+; UNROLLED-NEXT: [[ARRAYIDX14_US_EPIL:%.*]] = getelementptr inbounds nuw i8, ptr [[SHARR]], i64 [[INDVARS_IV_EPIL_INIT]]
+; UNROLLED-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX14_US_EPIL]], align 1
+; UNROLLED-NEXT: [[XOR30_US_EPIL:%.*]] = xor i8 [[DOTEPIL_INIT]], [[TMP10]]
+; UNROLLED-NEXT: store i8 [[XOR30_US_EPIL]], ptr [[BARR]], align 1
+; UNROLLED-NEXT: br label %[[FOR_INC_US_EPIL]]
+; UNROLLED: [[FOR_INC_US_EPIL]]:
+; UNROLLED-NEXT: br label %[[FOR_COND3_FOR_INC20_CRIT_EDGE_US]]
+; UNROLLED: [[FOR_COND3_FOR_INC20_CRIT_EDGE_US]]:
+; UNROLLED-NEXT: [[INDVARS_IV_NEXT37]] = add nuw nsw i64 [[INDVARS_IV36]], 1
+; UNROLLED-NEXT: [[EXITCOND40_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT37]], [[WIDE_TRIP_COUNT39]]
+; UNROLLED-NEXT: br i1 [[EXITCOND40_NOT]], label %[[FOR_END22_LOOPEXIT:.*]], label %[[FOR_BODY_US]]
+; UNROLLED: [[FOR_BODY_LR_PH_SPLIT]]:
+; UNROLLED-NEXT: store i8 0, ptr [[BARR]], align 1
+; UNROLLED-NEXT: br label %[[FOR_END22]]
+; UNROLLED: [[FOR_END22_LOOPEXIT]]:
+; UNROLLED-NEXT: br label %[[FOR_END22]]
+; UNROLLED: [[FOR_END22]]:
+; UNROLLED-NEXT: ret void
+;
+entry:
+ %cmp33 = icmp sgt i16 %dimout, 0
+ br i1 %cmp33, label %for.body.lr.ph, label %for.end22
+
+for.body.lr.ph: ; preds = %entry
+ %cmp631 = icmp sgt i16 %rows, 0
+ br i1 %cmp631, label %for.body.us.preheader, label %for.body.lr.ph.split
+
+for.body.us.preheader: ; preds = %for.body.lr.ph
+ %wide.trip.count39 = zext nneg i16 %dimout to i64
+ %wide.trip.count = zext nneg i16 %rows to i64
+ br label %for.body.us
+
+for.body.us: ; preds = %for.body.us.preheader, %for.cond3.for.inc20_crit_edge.us
+ %indvars.iv36 = phi i64 [ 0, %for.body.us.preheader ], [ %indvars.iv.next37, %for.cond3.for.inc20_crit_edge.us ]
+ store i8 0, ptr %barr, align 1
+ %invariant.gep.us = getelementptr i8, ptr %mat, i64 %indvars.iv36
+ br label %for.body8.us
+
+for.body8.us: ; preds = %for.body.us, %for.inc.us
+ %indvars.iv = phi i64 [ 0, %for.body.us ], [ %indvars.iv.next, %for.inc.us ]
+ %0 = phi i8 [ 0, %for.body.us ], [ %3, %for.inc.us ]
+ %gep.us = getelementptr [2 x i8], ptr %invariant.gep.us, i64 %indvars.iv
+ %1 = load i8, ptr %gep.us, align 1
+ %tobool.not.us = icmp eq i8 %1, 0
+ br i1 %tobool.not.us, label %for.inc.us, label %if.then.us
+
+if.then.us: ; preds = %for.body8.us
+ %arrayidx14.us = getelementptr inbounds nuw i8, ptr %sharr, i64 %indvars.iv
+ %2 = load i8, ptr %arrayidx14.us, align 1
+ %xor30.us = xor i8 %0, %2
+ store i8 %xor30.us, ptr %barr, align 1
+ br label %for.inc.us
+
+for.inc.us: ; preds = %if.then.us, %for.body8.us
+ %3 = phi i8 [ %0, %for.body8.us ], [ %xor30.us, %if.then.us ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond3.for.inc20_crit_edge.us, label %for.body8.us
+
+for.cond3.for.inc20_crit_edge.us: ; preds = %for.inc.us
+ %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, 1
+ %exitcond40.not = icmp eq i64 %indvars.iv.next37, %wide.trip.count39
+ br i1 %exitcond40.not, label %for.end22, label %for.body.us
+
+for.body.lr.ph.split: ; preds = %for.body.lr.ph
+ store i8 0, ptr %barr, align 1
+ br label %for.end22
+
+for.end22: ; preds = %for.cond3.for.inc20_crit_edge.us, %for.body.lr.ph.split, %entry
+ ret void
+}
+
+attributes #0 = { "target-cpu"="oryon-1" "target-features"="+neon,+sve" }
>From 905a2847d0e80377dc8a5a5813057a148d7c04a6 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Tue, 13 Jan 2026 14:15:36 +0530
Subject: [PATCH 02/10] [SeparateConstOffsetFromGEP] Restrict simplification of
XOR index when its base operand is constant
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 7 ++++++-
.../fix-const-xor-gep-simplify.ll | 17 +++++++++++++++++
2 files changed, 23 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index dc47b243625b8..d1ea825e33409 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -796,7 +796,12 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
ConstantInt *XorConstant;
// Match pattern: xor BaseOperand, Constant.
- if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
+ // Restrict BaseOperand to a compile-time unknown value.
+ // TODO: Formally prove that it is safe for BaseOperand to be a compile-time
+ // constant, and extend the optimization to those cases.
+ if (!match(XorInst,
+ m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))) ||
+ (isa<Constant>(BaseOperand)))
return APInt::getZero(BitWidth);
// Compute known bits for the base operand.
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
new file mode 100644
index 0000000000000..042d0aa03ed8a
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
@@ -0,0 +1,17 @@
+; RUN: opt -S -passes='separate-const-offset-from-gep' %s | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+define ptr @src(i32 %0) {
+; CHECK-LABEL: @src(
+; CHECK-NEXT: %base = alloca [4 x i32], align 16
+; CHECK-NEXT: %2 = xor i64 0, 3
+; CHECK-NEXT: %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %2
+; CHECK-NEXT: ret ptr %gep
+
+; CHECK-NOT: getelementptr i8, ptr %gep, i64 12
+ %base = alloca [4 x i32], align 16
+ %2 = xor i64 0, 3
+ %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %2
+ ret ptr %gep
+}
>From d43e9937f6b6cc8b7fca9f6cff2cfc578a55569d Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pawannirpal at gmail.com>
Date: Tue, 13 Jan 2026 14:29:41 +0530
Subject: [PATCH 03/10] Apply suggestion from @arsenm
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
.../SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
index 042d0aa03ed8a..e923072174bea 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -passes='separate-const-offset-from-gep' %s | FileCheck %s
+; RUN: opt -S -passes=separate-const-offset-from-gep %s | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-linux-gnu"
>From c5f2b69261afe193b415e2a7b007aac918da700c Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pawannirpal at gmail.com>
Date: Tue, 13 Jan 2026 14:30:12 +0530
Subject: [PATCH 04/10] Apply suggestion from @arsenm
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
.../SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
index e923072174bea..ce1174b700d24 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
@@ -1,6 +1,4 @@
; RUN: opt -S -passes=separate-const-offset-from-gep %s | FileCheck %s
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
-target triple = "aarch64-unknown-linux-gnu"
define ptr @src(i32 %0) {
; CHECK-LABEL: @src(
>From caebb2e14495b8fd8e78d0facfa4640d25e0e460 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Tue, 13 Jan 2026 14:41:47 +0530
Subject: [PATCH 05/10] address comments 1
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 2 +-
.../fix-const-xor-gep-simplify.ll | 19 ++++++++++---------
2 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index d1ea825e33409..2a1385c26763c 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -801,7 +801,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
// and extend the optimization for those cases.
if (!match(XorInst,
m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))) ||
- (isa<Constant>(BaseOperand)))
+ (match(BaseOperand, m_ConstantInt())))
return APInt::getZero(BitWidth);
// Compute known bits for the base operand.
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
index ce1174b700d24..50b79aac72c5f 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
@@ -1,15 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S -passes=separate-const-offset-from-gep %s | FileCheck %s
define ptr @src(i32 %0) {
-; CHECK-LABEL: @src(
-; CHECK-NEXT: %base = alloca [4 x i32], align 16
-; CHECK-NEXT: %2 = xor i64 0, 3
-; CHECK-NEXT: %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %2
-; CHECK-NEXT: ret ptr %gep
-
-; CHECK-NOT: getelementptr i8, ptr %gep, i64 12
+; CHECK-LABEL: define ptr @src(
+; CHECK-SAME: i32 [[TMP0:%.*]]) {
+; CHECK-NEXT: [[BASE:%.*]] = alloca [4 x i32], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, 3
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x i32], ptr [[BASE]], i64 0, i64 [[TMP2]]
+; CHECK-NEXT: ret ptr [[GEP]]
+;
%base = alloca [4 x i32], align 16
- %2 = xor i64 0, 3
- %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %2
+ %xor = xor i64 0, 3
+ %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %xor
ret ptr %gep
}
>From 57e3c81cd8e63bc6ebac9be6647e477375941844 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Tue, 13 Jan 2026 21:51:54 +0530
Subject: [PATCH 06/10] refactor logic
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 17 ++++++-----
.../fix-const-xor-gep-simplify.ll | 30 +++++++++++++++----
2 files changed, 34 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 2a1385c26763c..10cc056eec0cc 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -729,8 +729,16 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
// with non-disjoint constant bits.
// TODO: The design should be updated to support partial constant
// extraction.
- if (BO->getOpcode() == Instruction::Xor)
+ if (BO->getOpcode() == Instruction::Xor) {
+ if (auto *ConstIOther = dyn_cast<ConstantInt>(TheOther)) {
+ const APInt &DisjointBits = extractDisjointBitsFromXor(BO);
+ const APInt &ConstantValue = ConstIOther->getValue();
+ const APInt &NonDisjointBits = ConstantValue & (~DisjointBits);
+ BO->setOperand(1 - OpNo,
+ ConstantInt::get(BO->getType(), NonDisjointBits));
+ }
return BO;
+ }
// If NextInChain is 0 and not the LHS of a sub, we can simplify the
// sub-expression to be just TheOther.
@@ -796,12 +804,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
ConstantInt *XorConstant;
// Match pattern: xor BaseOperand, Constant.
- // Restricting the BaseOperand to a compile time unknown.
- // TODO : Formally prove the safety of BaseOperand to be compile time constant
- // and extend the optimization for those cases.
- if (!match(XorInst,
- m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))) ||
- (match(BaseOperand, m_ConstantInt())))
+ if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
return APInt::getZero(BitWidth);
// Compute known bits for the base operand.
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
index 50b79aac72c5f..049463b33ac71 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/fix-const-xor-gep-simplify.ll
@@ -1,16 +1,34 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -passes=separate-const-offset-from-gep %s | FileCheck %s
+; RUN: opt -S -passes='separate-const-offset-from-gep<lower-gep>' %s | FileCheck %s
-define ptr @src(i32 %0) {
-; CHECK-LABEL: define ptr @src(
+define ptr @xor-const-test(i32 %0) {
+; CHECK-LABEL: define ptr @xor-const-test(
; CHECK-SAME: i32 [[TMP0:%.*]]) {
; CHECK-NEXT: [[BASE:%.*]] = alloca [4 x i32], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, 3
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x i32], ptr [[BASE]], i64 0, i64 [[TMP2]]
-; CHECK-NEXT: ret ptr [[GEP]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i64 0, 0
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[XOR1]], 2
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i64 12
+; CHECK-NEXT: ret ptr [[TMP2]]
;
%base = alloca [4 x i32], align 16
%xor = xor i64 0, 3
%gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %xor
ret ptr %gep
}
+
+define ptr @xor-const-test2(i32 %0) {
+; CHECK-LABEL: define ptr @xor-const-test2(
+; CHECK-SAME: i32 [[TMP0:%.*]]) {
+; CHECK-NEXT: [[BASE:%.*]] = alloca [4 x i32], align 16
+; CHECK-NEXT: [[XOR1:%.*]] = xor i64 2, 2
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[XOR1]], 2
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[TMP2]]
+; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i64 4
+; CHECK-NEXT: ret ptr [[UGLYGEP2]]
+;
+ %base = alloca [4 x i32], align 16
+ %xor = xor i64 2, 3
+ %gep = getelementptr [4 x i32], ptr %base, i64 0, i64 %xor
+ ret ptr %gep
+}
>From 5964193743ed792a02bace52d14123aa730e2a96 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Wed, 21 Jan 2026 13:26:23 +0530
Subject: [PATCH 07/10] Add explanatory comments
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 43 ++++++++++++++++++-
1 file changed, 41 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 10cc056eec0cc..172f47fa852f7 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -725,12 +725,51 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
if (CI->isZero()) {
- // Custom XOR handling for disjoint bits - preserves original XOR
- // with non-disjoint constant bits.
+ // Custom XOR handling for disjoint bits.
// TODO: The design should be updated to support partial constant
// extraction.
if (BO->getOpcode() == Instruction::Xor) {
if (auto *ConstIOther = dyn_cast<ConstantInt>(TheOther)) {
+ // Purpose:
+ // We want to compute/simplify the expression:
+ // b + scale * (a ^ c)
+ // Here a and c are both constants.
+
+ // Transform:
+ // We must partition c into disjoint and non-disjoint components and
+ // only XOR the non-disjoint bits with a:
+
+ // non_disjoint(c) = c & ~disjoint(c)
+
+ // Therefore the correct form is:
+ // b + ((a ^ non_disjoint(c)) + disjoint(c)) * scale
+
+ // Rationale:
+ // - Bits of c that are disjoint from a (i.e., where a is known
+ // zero)
+ // pass through unchanged (added, not XORed).
+ // - Only the overlapping (non-disjoint) bits of c should
+ // participate
+ // in the XOR with a.
+
+ // Example:
+ // a = 0
+ // c = 3
+ // scale = 4
+
+ // Expected:
+ // b + scale * (a ^ c)
+ // = b + 4 * (0 ^ 3)
+ // = b + 4 * 3
+ // = b + 12
+
+ // Transform:
+ // non_disjoint(3) = 3 & ~3 = 0
+ // b + ((0 ^ non_disjoint(3)) + disjoint(3)) * 4
+ // = b + ((0 ^ 0) + 3) * 4
+ // = b + 3 * 4
+ // = b + 12
+
const APInt &DisjointBits = extractDisjointBitsFromXor(BO);
const APInt &ConstantValue = ConstIOther->getValue();
const APInt &NonDisjointBits = ConstantValue & (~DisjointBits);
>From 60cdac52f7143ff0f14b06fe2004dc92568f20ca Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Wed, 21 Jan 2026 13:29:55 +0530
Subject: [PATCH 08/10] comment
---
llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 172f47fa852f7..d5a951145b949 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -744,6 +744,9 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
// Therefore the correct form is:
// b + ((a ^ non_disjoint(c)) + disjoint(c)) * scale
+ // And not b + ((a ^ c) + disjoint(c)) * scale, which ignores the
+ // non_disjoint split of the constant c and is therefore incorrect.
+
// Rationale:
// - Bits of c that are disjoint from a (i.e., where a is known
// zero)
>From c06cf41488b650a29e7433b406b49ee798d61b2f Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Thu, 29 Jan 2026 19:46:07 +0530
Subject: [PATCH 09/10] Address comments
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 137 +++++++++---------
1 file changed, 66 insertions(+), 71 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index d5a951145b949..c91d3bf2abecf 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -297,7 +297,7 @@ class ConstantOffsetExtractor {
/// Analyze XOR instruction to extract disjoint constant bits that behave
/// like addition operations for improved address mode folding.
- APInt extractDisjointBitsFromXor(BinaryOperator *XorInst);
+ std::pair<APInt, APInt> extractDisjointBitsFromXor(BinaryOperator *XorInst);
/// The path from the constant offset to the old GEP index. e.g., if the GEP
/// index is "a * b + (c + 5)". After running function find, UserChain[0] will
@@ -602,8 +602,10 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
// Handle XOR with disjoint bits that can be treated as addition.
- else if (BO->getOpcode() == Instruction::Xor)
- ConstantOffset = extractDisjointBitsFromXor(BO);
+ else if (BO->getOpcode() == Instruction::Xor) {
+ auto [DisjointBits, NonDisjointBits] = extractDisjointBitsFromXor(BO);
+ ConstantOffset = DisjointBits;
+ }
} else if (isa<TruncInst>(V)) {
ConstantOffset =
find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
@@ -723,72 +725,65 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
Value *NextInChain = removeConstOffset(ChainIndex - 1);
Value *TheOther = BO->getOperand(1 - OpNo);
- if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
- if (CI->isZero()) {
- // Custom XOR handling for disjoint bits.
- // TODO: The design should be updated to support partial constant
- // extraction.
- if (BO->getOpcode() == Instruction::Xor) {
- if (auto *ConstIOther = dyn_cast<ConstantInt>(TheOther)) {
- // Purpose:
- // We want to compute/simplify the expression:
- // b + scale * (a ^ c)
- // Here a and c are both constants.
-
- // Transform:
- // We must partition c into disjoint and non-disjoint components and
- // only XOR the non-disjoint bits with a:
-
- // non_disjoint(c) = c & ~disjoint(c)
-
- // Therefore the correct form is:
- // b + ((a ^ non_disjoint(c)) + disjoint(c)) * scale
-
- // And not b + ((a ^ (c)) + disjoint(c)) * scale, which ignores the
- // non_disjoint split of the constant c which is incorrect.
-
- // Rationale:
- // - Bits of c that are disjoint from a (i.e., where a is known
- // zero)
- // pass through unchanged (added, not XORed).
- // - Only the overlapping (non-disjoint) bits of c should
- // participate
- // in the XOR with a.
-
- // Example:
- // a = 0
- // c = 3
- // scale = 4
-
- // Expected:
- // b + scale * (a ^ c)
- // = b + 4 * (0 ^ 3)
- // = b + 4 * 3
- // = b + 12
-
- // Transform:
- // non_disjoint(3) = 3 & ~3 = 0
- // b + ((0 ^ non_disjoint(3)) + disjoint(3)) * 4
- // = b + ((0 ^ 0) + 3) * 4
- // = b + 3 * 4
- // = b + 12
-
- const APInt &DisjointBits = extractDisjointBitsFromXor(BO);
- const APInt &ConstantValue = ConstIOther->getValue();
- const APInt &NonDisjointBits = ConstantValue & (~DisjointBits);
- BO->setOperand(1 - OpNo,
- ConstantInt::get(BO->getType(), NonDisjointBits));
- }
- return BO;
- }
-
- // If NextInChain is 0 and not the LHS of a sub, we can simplify the
- // sub-expression to be just TheOther.
- if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
- return TheOther;
+ // Custom XOR handling for disjoint bits.
+ // TODO: The design should be updated to support partial constant
+ // extraction.
+ if (BO->getOpcode() == Instruction::Xor) {
+ if (dyn_cast<ConstantInt>(TheOther)) {
+ // Purpose:
+ // We want to compute/simplify the expression:
+ // b + scale * (a ^ c)
+ // Here a and c are both constants.
+
+ // Transform:
+ // We must partition c into disjoint and non-disjoint components and
+ // only XOR the non-disjoint bits with a:
+
+ // non_disjoint(c) = c & ~disjoint(c)
+
+ // Therefore the correct form is:
+ // b + ((a ^ non_disjoint(c)) + disjoint(c)) * scale
+
+ // And not b + ((a ^ (c)) + disjoint(c)) * scale, which ignores the
+ // non_disjoint split of the constant c which is incorrect.
+
+ // Rationale:
+ // - Bits of c that are disjoint from a (i.e., where a is known
+ // zero)
+ // pass through unchanged (added, not XORed).
+ // - Only the overlapping (non-disjoint) bits of c should
+ // participate in the XOR with a.
+
+ // Example:
+ // a = 0
+ // c = 3
+ // scale = 4
+
+ // Expected:
+ // b + scale * (a ^ c)
+ // = b + 4 * (0 ^ 3)
+ // = b + 4 * 3
+ // = b + 12
+
+ // Transform:
+ // non_disjoint(3) = 3 & ~3 = 0
+ // b + ((0 ^ non_disjoint(3)) + disjoint(3)) * 4
+ // = b + ((0 ^ 0) + 3) * 4
+ // = b + 3 * 4
+ // = b + 12
+
+ auto [DisjointBits, NonDisjointBits] = extractDisjointBitsFromXor(BO);
+ BO->setOperand(1 - OpNo,
+ ConstantInt::get(BO->getType(), NonDisjointBits));
}
+ return BO;
}
+ // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+ // sub-expression to be just TheOther.
+ if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
+
BinaryOperator::BinaryOps NewOp = BO->getOpcode();
if (BO->getOpcode() == Instruction::Or) {
// Rebuild "or" as "add", because "or" may be invalid for the new
@@ -836,8 +831,8 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
/// \param XorInst The XOR binary operator to analyze
/// \return APInt containing the disjoint bits that can be extracted as offset,
/// or zero if no disjoint bits exist
-APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
- BinaryOperator *XorInst) {
+std::pair<APInt, APInt>
+ConstantOffsetExtractor::extractDisjointBitsFromXor(BinaryOperator *XorInst) {
assert(XorInst && XorInst->getOpcode() == Instruction::Xor &&
"Expected XOR instruction");
@@ -847,7 +842,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
// Match pattern: xor BaseOperand, Constant.
if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
- return APInt::getZero(BitWidth);
+ return {APInt::getZero(BitWidth), APInt::getZero(BitWidth)};
// Compute known bits for the base operand.
const SimplifyQuery SQ(DL);
@@ -859,7 +854,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
// Early exit if no disjoint bits found.
if (DisjointBits.isZero())
- return APInt::getZero(BitWidth);
+ return {APInt::getZero(BitWidth), APInt::getZero(BitWidth)};
// Compute the remaining non-disjoint bits that stay in the XOR.
const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
@@ -875,7 +870,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
// This will replace the original constant in the XOR with the new
// constant.
UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits));
- return DisjointBits;
+ return {DisjointBits, NonDisjointBits};
}
/// A helper function to check if reassociating through an entry in the user
>From a3430f7bcd51cbf79e5f8295b578d1676a34437e Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Thu, 29 Jan 2026 19:56:38 +0530
Subject: [PATCH 10/10] keep the nextinchain zero for sub simplify
---
llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index c91d3bf2abecf..c84e583362b9b 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -781,8 +781,10 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
// If NextInChain is 0 and not the LHS of a sub, we can simplify the
// sub-expression to be just TheOther.
- if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
- return TheOther;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain))
+ if (CI->isZero())
+ if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
BinaryOperator::BinaryOps NewOp = BO->getOpcode();
if (BO->getOpcode() == Instruction::Or) {
More information about the llvm-commits
mailing list