[llvm] [SeparateConstOffsetFromGEP] Decompose constant xor operand if possible (PR #150438)
Sumanth Gundapaneni via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 5 07:46:34 PDT 2025
https://github.com/sgundapa updated https://github.com/llvm/llvm-project/pull/150438
>From 72844323dc672f2d077f169c0f3856e8f2401d96 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at amd.com>
Date: Wed, 23 Jul 2025 11:58:12 -0500
Subject: [PATCH 01/10] [SeparateConstOffsetFromGEP] Decompose constant xor
operand if possible
Try to transform XOR(A, B+C) into XOR(A, C) + B, where XOR(A, C) is part
of the base for memory operations. This transformation is valid under the
following conditions:
Check 1 - B and C are disjoint.
Check 2 - XOR(A, C) and B are disjoint.
This transformation can map these xors into better addressing modes and
eventually decompose them into GEPs.
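For illustration, a minimal standalone C++ sketch of the identity behind
these two checks, using the constants from the lit test below (A is the
select of 0 or 288, C = 32, B = 8192); this is not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t B = 8192, C = 32;
  for (uint32_t A : {0u, 288u}) {
    assert((B & C) == 0);       // Check 1: B and C are disjoint.
    assert((B & (A ^ C)) == 0); // Check 2: XOR(A, C) and B are disjoint.
    // Under both checks, the xor by B+C splits into an xor plus an add,
    // so B can be peeled off as a constant GEP offset:
    assert((A ^ (B + C)) == ((A ^ C) + B));
  }
  return 0;
}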
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 141 ++++++++++++++++--
.../AMDGPU/xor-idiom.ll | 66 ++++++++
2 files changed, 191 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 320b79203c0b3..203850c28787c 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -238,16 +238,17 @@ class ConstantOffsetExtractor {
/// \p PreservesNUW Outputs whether the extraction allows preserving the
/// GEP's nuw flag, if it has one.
static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
- User *&UserChainTail, bool &PreservesNUW);
+ User *&UserChainTail, bool &PreservesNUW,
+ DominatorTree *DT);
/// Looks for a constant offset from the given GEP index without extracting
/// it. It returns the numeric value of the extracted constant offset (0 if
/// failed). The meaning of the arguments are the same as Extract.
- static int64_t Find(Value *Idx, GetElementPtrInst *GEP);
+ static int64_t Find(Value *Idx, GetElementPtrInst *GEP, DominatorTree *DT);
private:
- ConstantOffsetExtractor(BasicBlock::iterator InsertionPt)
- : IP(InsertionPt), DL(InsertionPt->getDataLayout()) {}
+ ConstantOffsetExtractor(BasicBlock::iterator InsertionPt, DominatorTree *DT)
+ : IP(InsertionPt), DT(DT), DL(InsertionPt->getDataLayout()) {}
/// Searches the expression that computes V for a non-zero constant C s.t.
/// V can be reassociated into the form V' + C. If the searching is
@@ -321,6 +322,20 @@ class ConstantOffsetExtractor {
bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
bool NonNegative);
+ // Find the most dominating Xor with the same base operand.
+ BinaryOperator *findDominatingXor(Value *BaseOperand,
+ BinaryOperator *CurrentXor);
+
+ /// Check if Xor instruction should be considered for optimization.
+ bool shouldConsiderXor(BinaryOperator *XorInst);
+
+ /// Cache the information about Xor idiom.
+ struct XorRewriteInfo {
+ llvm::BinaryOperator *BaseXor = nullptr;
+ int64_t AdjustedOffset = 0;
+ };
+ std::optional<XorRewriteInfo> CachedXorInfo;
+
/// The path from the constant offset to the old GEP index. e.g., if the GEP
/// index is "a * b + (c + 5)". After running function find, UserChain[0] will
/// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
@@ -336,6 +351,8 @@ class ConstantOffsetExtractor {
/// Insertion position of cloned instructions.
BasicBlock::iterator IP;
+ DominatorTree *DT;
+
const DataLayout &DL;
};
@@ -514,12 +531,14 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
bool ZeroExtended,
BinaryOperator *BO,
bool NonNegative) {
- // We only consider ADD, SUB and OR, because a non-zero constant found in
+ // We only consider ADD, SUB, OR and XOR, because a non-zero constant found in
// expressions composed of these operations can be easily hoisted as a
- // constant offset by reassociation.
+ // constant offset by reassociation. XOR is a special case and can be folded
+ // into a GEP if the constant is proven to be disjoint.
if (BO->getOpcode() != Instruction::Add &&
BO->getOpcode() != Instruction::Sub &&
- BO->getOpcode() != Instruction::Or) {
+ BO->getOpcode() != Instruction::Or &&
+ BO->getOpcode() != Instruction::Xor) {
return false;
}
@@ -530,6 +549,10 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
!cast<PossiblyDisjointInst>(BO)->isDisjoint())
return false;
+ // Handle Xor idiom.
+ if (BO->getOpcode() == Instruction::Xor)
+ return shouldConsiderXor(BO);
+
// FIXME: We don't currently support constants from the RHS of subs,
// when we are zero-extended, because we need a way to zero-extended
// them before they are negated.
@@ -740,6 +763,10 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
"UserChain, so no one should be used more than "
"once");
+ // Special case for Xor idiom.
+ if (BO->getOpcode() == Instruction::Xor)
+ return CachedXorInfo->BaseXor;
+
unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
Value *NextInChain = removeConstOffset(ChainIndex - 1);
@@ -780,6 +807,80 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
return NewBO;
}
+// Find the most dominating Xor with the same base operand.
+BinaryOperator *
+ConstantOffsetExtractor::findDominatingXor(Value *BaseOperand,
+ BinaryOperator *CurrentXor) {
+ BinaryOperator *MostDominatingXor = nullptr;
+ // Iterate over all instructions that use the BaseOperand.
+ for (User *U : BaseOperand->users()) {
+ auto *CandidateXor = dyn_cast<BinaryOperator>(U);
+
+ // Simple checks.
+ if (!CandidateXor || CandidateXor == CurrentXor)
+ continue;
+
+ // Check if the binary operator is a Xor with constant.
+ if (!match(CandidateXor, m_Xor(m_Specific(BaseOperand), m_ConstantInt())))
+ continue;
+
+ // After confirming the structure, check the dominance relationship.
+ if (DT->dominates(CandidateXor, CurrentXor))
+ // If we find a dominating Xor, keep it if it's the first one,
+ // or if it dominates the best candidate we've found so far.
+ if (!MostDominatingXor || DT->dominates(CandidateXor, MostDominatingXor))
+ MostDominatingXor = CandidateXor;
+ }
+
+ return MostDominatingXor;
+}
+
+// Check if Xor should be considered.
+// Only the following idiom is considered.
+// Example:
+// %3 = xor i32 %2, 32
+// %4 = xor i32 %2, 8224
+// %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+// %7 = getelementptr half, ptr addrspace(3) %1, i32 %4
+// The GEP that corresponds to %7 looks at the binary operator %4.
+// For %4 to be considered, it must have a dominating xor with a constant
+// operand that is disjoint from the adjusted offset.
+// If disjoint, %4 = xor i32 %2, 8224 can be treated as %4 = add i32 %3, 8192
+bool ConstantOffsetExtractor::shouldConsiderXor(BinaryOperator *XorInst) {
+
+ Value *BaseOperand = nullptr;
+ ConstantInt *CurrentConst = nullptr;
+ if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(CurrentConst))))
+ return false;
+
+ // Find the most dominating Xor with the same base operand.
+ BinaryOperator *DominatingXor = findDominatingXor(BaseOperand, XorInst);
+ if (!DominatingXor)
+ return false;
+
+ // We expect the dominating instruction to also be a 'xor-const'.
+ ConstantInt *DominatingConst = nullptr;
+ if (!match(DominatingXor,
+ m_Xor(m_Specific(BaseOperand), m_ConstantInt(DominatingConst))))
+ return false;
+
+ // Calculate the adjusted offset (difference between constants)
+ APInt AdjustedOffset = CurrentConst->getValue() - DominatingConst->getValue();
+
+ // Check disjoint conditions
+ // 1. AdjustedOffset and DominatingConst should be disjoint
+ if ((AdjustedOffset & DominatingConst->getValue()) != 0)
+ return false;
+
+ // 2. DominatingXor and AdjustedOffset should be disjoint
+ if (!MaskedValueIsZero(DominatingXor, AdjustedOffset, SimplifyQuery(DL), 0))
+ return false;
+
+ // Cache the result.
+ CachedXorInfo = XorRewriteInfo{DominatingXor, AdjustedOffset.getSExtValue()};
+ return true;
+}
+
/// A helper function to check if reassociating through an entry in the user
/// chain would invalidate the GEP's nuw flag.
static bool allowsPreservingNUW(const User *U) {
@@ -805,8 +906,8 @@ static bool allowsPreservingNUW(const User *U) {
Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
User *&UserChainTail,
- bool &PreservesNUW) {
- ConstantOffsetExtractor Extractor(GEP->getIterator());
+ bool &PreservesNUW, DominatorTree *DT) {
+ ConstantOffsetExtractor Extractor(GEP->getIterator(), DT);
// Find a non-zero constant offset first.
APInt ConstantOffset =
Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
@@ -825,12 +926,20 @@ Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
return IdxWithoutConstOffset;
}
-int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) {
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
+ DominatorTree *DT) {
// If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
- return ConstantOffsetExtractor(GEP->getIterator())
- .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
- GEP->isInBounds())
- .getSExtValue();
+ ConstantOffsetExtractor Extractor(GEP->getIterator(), DT);
+ auto Offset = Extractor
+ .find(Idx, /* SignExtended */ false,
+ /* ZeroExtended */ false, GEP->isInBounds())
+ .getSExtValue();
+
+ // Return the disjoint offset for Xor.
+ if (Extractor.CachedXorInfo)
+ return Extractor.CachedXorInfo->AdjustedOffset;
+
+ return Offset;
}
bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToIndexSize(
@@ -866,7 +975,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
// Tries to extract a constant offset from this GEP index.
int64_t ConstantOffset =
- ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP);
+ ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
if (ConstantOffset != 0) {
NeedsExtraction = true;
// A GEP may have multiple indices. We accumulate the extracted
@@ -1106,7 +1215,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
User *UserChainTail;
bool PreservesNUW;
Value *NewIdx = ConstantOffsetExtractor::Extract(
- OldIdx, GEP, UserChainTail, PreservesNUW);
+ OldIdx, GEP, UserChainTail, PreservesNUW, DT);
if (NewIdx != nullptr) {
// Switches to the index with the constant offset removed.
GEP->setOperand(I, NewIdx);
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll
new file mode 100644
index 0000000000000..a0d0de070e735
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
+; RUN: -S < %s | FileCheck %s
+
+; Test the xor idiom.
+; Xors with disjoint constants 4128, 8224, and 12320 must be folded into GEPs.
+; Xors with non-disjoint constants 2336 and 8480 should not be optimized.
+define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test1(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP14:%.*]] = xor i32 [[TMP2]], 2336
+; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 8480
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP20]], i32 8192
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 16384
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP25]], i32 24576
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP16]], align 16
+; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP21]], align 16
+; CHECK-NEXT: [[TMP18:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; CHECK-NEXT: [[TMP19:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP15]], align 16
+; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP22:%.*]] = fadd <8 x half> [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = fadd <8 x half> [[TMP19]], [[TMP11]]
+; CHECK-NEXT: [[TMP24:%.*]] = fadd <8 x half> [[TMP12]], [[TMP22]]
+; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP23]], [[TMP24]]
+; CHECK-NEXT: store <8 x half> [[TMP13]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %2 = select i1 %0, i32 0, i32 288
+ %3 = xor i32 %2, 32 // Base
+ %4 = xor i32 %2, 2336 // Not disjoint
+ %5 = xor i32 %2, 4128 // Disjoint
+ %6 = xor i32 %2, 8224 // Disjoint
+ %7 = xor i32 %2, 8480 // Not disjoint
+ %8 = xor i32 %2, 12320 // Disjoint
+ %9 = getelementptr half, ptr addrspace(3) %1, i32 %3
+ %10 = getelementptr half, ptr addrspace(3) %1, i32 %4
+ %11 = getelementptr half, ptr addrspace(3) %1, i32 %5
+ %12 = getelementptr half, ptr addrspace(3) %1, i32 %6
+ %13 = getelementptr half, ptr addrspace(3) %1, i32 %7
+ %14 = getelementptr half, ptr addrspace(3) %1, i32 %8
+ %15 = load <8 x half>, ptr addrspace(3) %9, align 16
+ %16 = load <8 x half>, ptr addrspace(3) %10, align 16
+ %17 = load <8 x half>, ptr addrspace(3) %11, align 16
+ %18 = load <8 x half>, ptr addrspace(3) %12, align 16
+ %19 = load <8 x half>, ptr addrspace(3) %13, align 16
+ %20 = load <8 x half>, ptr addrspace(3) %14, align 16
+ %21 = fadd <8 x half> %15, %16
+ %22 = fadd <8 x half> %17, %18
+ %23 = fadd <8 x half> %19, %20
+ %24 = fadd <8 x half> %21, %22
+ %25 = fadd <8 x half> %23, %24
+ store <8 x half> %25, ptr addrspace(3) %1, align 16
+ ret void
+}
>From a56ac2f27523f540a5ca286ef7905343450169f7 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at amd.com>
Date: Thu, 24 Jul 2025 11:08:38 -0500
Subject: [PATCH 02/10] Update lit test with comments
---
.../SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll
index a0d0de070e735..2cbf2ead2107e 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll
@@ -38,12 +38,12 @@ define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) {
;
entry:
%2 = select i1 %0, i32 0, i32 288
- %3 = xor i32 %2, 32 // Base
- %4 = xor i32 %2, 2336 // Not disjoint
- %5 = xor i32 %2, 4128 // Disjoint
- %6 = xor i32 %2, 8224 // Disjoint
- %7 = xor i32 %2, 8480 // Not disjoint
- %8 = xor i32 %2, 12320 // Disjoint
+ %3 = xor i32 %2, 32 ; Base
+ %4 = xor i32 %2, 2336 ; Not disjoint
+ %5 = xor i32 %2, 4128 ; Disjoint
+ %6 = xor i32 %2, 8224 ; Disjoint
+ %7 = xor i32 %2, 8480 ; Not disjoint
+ %8 = xor i32 %2, 12320 ; Disjoint
%9 = getelementptr half, ptr addrspace(3) %1, i32 %3
%10 = getelementptr half, ptr addrspace(3) %1, i32 %4
%11 = getelementptr half, ptr addrspace(3) %1, i32 %5
>From 49bcd01bce48be7fa68cb130606ffa52c0e363c6 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at amd.com>
Date: Tue, 29 Jul 2025 12:49:41 -0500
Subject: [PATCH 03/10] Update the patch
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 167 ++++++---------
.../AMDGPU/xor-decompose.ll | 195 ++++++++++++++++++
.../AMDGPU/xor-idiom.ll | 66 ------
3 files changed, 256 insertions(+), 172 deletions(-)
create mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
delete mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 203850c28787c..c6ce7859a1f31 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -238,17 +238,16 @@ class ConstantOffsetExtractor {
/// \p PreservesNUW Outputs whether the extraction allows preserving the
/// GEP's nuw flag, if it has one.
static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
- User *&UserChainTail, bool &PreservesNUW,
- DominatorTree *DT);
+ User *&UserChainTail, bool &PreservesNUW);
/// Looks for a constant offset from the given GEP index without extracting
/// it. It returns the numeric value of the extracted constant offset (0 if
/// failed). The meaning of the arguments are the same as Extract.
- static int64_t Find(Value *Idx, GetElementPtrInst *GEP, DominatorTree *DT);
+ static int64_t Find(Value *Idx, GetElementPtrInst *GEP);
private:
- ConstantOffsetExtractor(BasicBlock::iterator InsertionPt, DominatorTree *DT)
- : IP(InsertionPt), DT(DT), DL(InsertionPt->getDataLayout()) {}
+ ConstantOffsetExtractor(BasicBlock::iterator InsertionPt)
+ : IP(InsertionPt), DL(InsertionPt->getDataLayout()) {}
/// Searches the expression that computes V for a non-zero constant C s.t.
/// V can be reassociated into the form V' + C. If the searching is
@@ -322,19 +321,9 @@ class ConstantOffsetExtractor {
bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
bool NonNegative);
- // Find the most dominating Xor with the same base operand.
- BinaryOperator *findDominatingXor(Value *BaseOperand,
- BinaryOperator *CurrentXor);
-
- /// Check if Xor instruction should be considered for optimization.
- bool shouldConsiderXor(BinaryOperator *XorInst);
-
- /// Cache the information about Xor idiom.
- struct XorRewriteInfo {
- llvm::BinaryOperator *BaseXor = nullptr;
- int64_t AdjustedOffset = 0;
- };
- std::optional<XorRewriteInfo> CachedXorInfo;
+ /// Check if Xor instruction should be considered and updated for
+ /// optimization.
+ bool shouldConsiderAndUpdateXor(BinaryOperator *XorInst);
/// The path from the constant offset to the old GEP index. e.g., if the GEP
/// index is "a * b + (c + 5)". After running function find, UserChain[0] will
@@ -351,8 +340,6 @@ class ConstantOffsetExtractor {
/// Insertion position of cloned instructions.
BasicBlock::iterator IP;
- DominatorTree *DT;
-
const DataLayout &DL;
};
@@ -549,9 +536,9 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
!cast<PossiblyDisjointInst>(BO)->isDisjoint())
return false;
- // Handle Xor idiom.
+ // Handle Xor decomposition.
if (BO->getOpcode() == Instruction::Xor)
- return shouldConsiderXor(BO);
+ return shouldConsiderAndUpdateXor(BO);
// FIXME: We don't currently support constants from the RHS of subs,
// when we are zero-extended, because we need a way to zero-extended
@@ -763,10 +750,6 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
"UserChain, so no one should be used more than "
"once");
- // Special case for Xor idiom.
- if (BO->getOpcode() == Instruction::Xor)
- return CachedXorInfo->BaseXor;
-
unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
Value *NextInChain = removeConstOffset(ChainIndex - 1);
@@ -807,77 +790,57 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
return NewBO;
}
-// Find the most dominating Xor with the same base operand.
-BinaryOperator *
-ConstantOffsetExtractor::findDominatingXor(Value *BaseOperand,
- BinaryOperator *CurrentXor) {
- BinaryOperator *MostDominatingXor = nullptr;
- // Iterate over all instructions that use the BaseOperand.
- for (User *U : BaseOperand->users()) {
- auto *CandidateXor = dyn_cast<BinaryOperator>(U);
-
- // Simple checks.
- if (!CandidateXor || CandidateXor == CurrentXor)
- continue;
-
- // Check if the binary operator is a Xor with constant.
- if (!match(CandidateXor, m_Xor(m_Specific(BaseOperand), m_ConstantInt())))
- continue;
-
- // After confirming the structure, check the dominance relationship.
- if (DT->dominates(CandidateXor, CurrentXor))
- // If we find a dominating Xor, keep it if it's the first one,
- // or if it dominates the best candidate we've found so far.
- if (!MostDominatingXor || DT->dominates(CandidateXor, MostDominatingXor))
- MostDominatingXor = CandidateXor;
- }
-
- return MostDominatingXor;
-}
-
-// Check if Xor should be considered.
-// Only the following idiom is considered.
-// Example:
-// %3 = xor i32 %2, 32
-// %4 = xor i32 %2, 8224
-// %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
-// %7 = getelementptr half, ptr addrspace(3) %1, i32 %4
-// The GEP that corresponds to %7 looks at the binary operator %4.
-// For %4 to be considered, it must have a dominating xor with a constant
-// operand that is disjoint from the adjusted offset.
-// If disjoint, %4 = xor i32 %2, 8224 can be treated as %4 = add i32 %3, 8192
-bool ConstantOffsetExtractor::shouldConsiderXor(BinaryOperator *XorInst) {
-
- Value *BaseOperand = nullptr;
- ConstantInt *CurrentConst = nullptr;
- if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(CurrentConst))))
+/// \brief Tries to canonicalize a 'xor' with a constant into a form that is
+/// more amenable to address-mode matching.
+///
+/// The transformation rewrites `Base ^ Const` into
+/// `(Base ^ NonDisjointBits) ^ DisjointBits`.
+///
+/// `DisjointBits` are the bits set in `Const` operand that are known to be zero
+/// in `Base` operand. For these bits, the `xor` operation is equivalent to
+/// `add`, which exposes an offset that can be more easily folded into a memory
+/// access.
+///
+/// For example, if we know the low bit of `%ptr` is 0:
+/// `xor %ptr, 3` ; 3 is `0b11`
+/// becomes:
+/// `%tmp = xor %ptr, 2` ; NonDisjointBits is `0b10`
+/// `xor %tmp, 1` ; DisjointBits is `0b01`
+///
+/// The final `xor %tmp, 1` is an addition of 1.
+///
+/// \returns `true` if the transformation was applied, `false` otherwise.
+bool ConstantOffsetExtractor::shouldConsiderAndUpdateXor(
+ BinaryOperator *XorInst) {
+ Value *BaseOperand;
+ ConstantInt *XorConst;
+ if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConst))))
return false;
- // Find the most dominating Xor with the same base operand.
- BinaryOperator *DominatingXor = findDominatingXor(BaseOperand, XorInst);
- if (!DominatingXor)
- return false;
+ const SimplifyQuery SQ(DL);
+ const KnownBits BaseKnown = computeKnownBits(BaseOperand, SQ);
+ const APInt &ConstValue = XorConst->getValue();
- // We expect the dominating instruction to also be a 'xor-const'.
- ConstantInt *DominatingConst = nullptr;
- if (!match(DominatingXor,
- m_Xor(m_Specific(BaseOperand), m_ConstantInt(DominatingConst))))
+ // Check if any bits of the constant can be treated as disjoint
+ // (addition-like).
+ const APInt DisjointBits = ConstValue & BaseKnown.Zero;
+ if (DisjointBits.isZero())
return false;
- // Calculate the adjusted offset (difference between constants)
- APInt AdjustedOffset = CurrentConst->getValue() - DominatingConst->getValue();
+ // Split the xor into disjoint and non-disjoint parts.
+ const APInt NonDisjointBits = ConstValue & ~DisjointBits;
- // Check disjoint conditions
- // 1. AdjustedOffset and DominatingConst should be disjoint
- if ((AdjustedOffset & DominatingConst->getValue()) != 0)
- return false;
+ IRBuilder<> Builder(XorInst);
+ Type *Ty = XorConst->getType();
- // 2. DominatingXor and AdjustedOffset should be disjoint
- if (!MaskedValueIsZero(DominatingXor, AdjustedOffset, SimplifyQuery(DL), 0))
- return false;
+ // Transform: (base ^ constant) -> ((base ^ non_disjoint) ^ disjoint).
+ if (!NonDisjointBits.isZero()) {
+ Value *NewBase =
+ Builder.CreateXor(BaseOperand, ConstantInt::get(Ty, NonDisjointBits));
+ XorInst->setOperand(0, NewBase);
+ }
- // Cache the result.
- CachedXorInfo = XorRewriteInfo{DominatingXor, AdjustedOffset.getSExtValue()};
+ XorInst->setOperand(1, ConstantInt::get(Ty, DisjointBits));
return true;
}
@@ -906,8 +869,8 @@ static bool allowsPreservingNUW(const User *U) {
Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
User *&UserChainTail,
- bool &PreservesNUW, DominatorTree *DT) {
- ConstantOffsetExtractor Extractor(GEP->getIterator(), DT);
+ bool &PreservesNUW) {
+ ConstantOffsetExtractor Extractor(GEP->getIterator());
// Find a non-zero constant offset first.
APInt ConstantOffset =
Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
@@ -926,20 +889,12 @@ Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
return IdxWithoutConstOffset;
}
-int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
- DominatorTree *DT) {
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) {
// If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
- ConstantOffsetExtractor Extractor(GEP->getIterator(), DT);
- auto Offset = Extractor
- .find(Idx, /* SignExtended */ false,
- /* ZeroExtended */ false, GEP->isInBounds())
- .getSExtValue();
-
- // Return the disjoint offset for Xor.
- if (Extractor.CachedXorInfo)
- return Extractor.CachedXorInfo->AdjustedOffset;
-
- return Offset;
+ return ConstantOffsetExtractor(GEP->getIterator())
+ .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds())
+ .getSExtValue();
}
bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToIndexSize(
@@ -975,7 +930,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
// Tries to extract a constant offset from this GEP index.
int64_t ConstantOffset =
- ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
+ ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP);
if (ConstantOffset != 0) {
NeedsExtraction = true;
// A GEP may have multiple indices. We accumulate the extracted
@@ -1215,7 +1170,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
User *UserChainTail;
bool PreservesNUW;
Value *NewIdx = ConstantOffsetExtractor::Extract(
- OldIdx, GEP, UserChainTail, PreservesNUW, DT);
+ OldIdx, GEP, UserChainTail, PreservesNUW);
if (NewIdx != nullptr) {
// Switches to the index with the constant offset removed.
GEP->setOperand(I, NewIdx);
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
new file mode 100644
index 0000000000000..f7cd8f3139ae4
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
@@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; Test that an xor with a constant operand is decomposed into a GEP.
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
+; RUN: -S < %s | FileCheck %s
+; Test that the gvn pass eliminates the redundant xor instructions left by the decomposition.
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep,gvn \
+; RUN: -S < %s | FileCheck --check-prefix=GVN %s
+
+define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test1(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 8192
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP10]], i32 16384
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 24576
+; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP9]], align 16
+; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP11]], align 16
+; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
+; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT: ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test1(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT: [[ENTRY:.*:]]
+; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
+; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384
+; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576
+; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]]
+; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]]
+; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT: ret void
+;
+entry:
+ %2 = select i1 %0, i32 0, i32 288
+ %3 = xor i32 %2, 32
+ %4 = xor i32 %2, 4128
+ %5 = xor i32 %2, 8224
+ %6 = xor i32 %2, 12320
+ %7 = getelementptr half, ptr addrspace(3) %1, i32 %3
+ %8 = getelementptr half, ptr addrspace(3) %1, i32 %4
+ %9 = getelementptr half, ptr addrspace(3) %1, i32 %5
+ %10 = getelementptr half, ptr addrspace(3) %1, i32 %6
+ %11 = load <8 x half>, ptr addrspace(3) %7, align 16
+ %12 = load <8 x half>, ptr addrspace(3) %8, align 16
+ %13 = load <8 x half>, ptr addrspace(3) %9, align 16
+ %14 = load <8 x half>, ptr addrspace(3) %10, align 16
+ %15 = fadd <8 x half> %11, %12
+ %16 = fadd <8 x half> %13, %14
+ %17 = fadd <8 x half> %15, %16
+ store <8 x half> %17, ptr addrspace(3) %1, align 16
+ ret void
+}
+
+define amdgpu_kernel void @test2(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test2(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 24576
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 16384
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 8192
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
+; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP12]], align 16
+; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
+; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT: ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test2(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT: [[ENTRY:.*:]]
+; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576
+; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384
+; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
+; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]]
+; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]]
+; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT: ret void
+;
+entry:
+ %2 = select i1 %0, i32 0, i32 288
+ %3 = xor i32 %2, 12320
+ %4 = xor i32 %2, 8224
+ %5 = xor i32 %2, 4128
+ %6 = xor i32 %2, 32
+ %7 = getelementptr half, ptr addrspace(3) %1, i32 %3
+ %8 = getelementptr half, ptr addrspace(3) %1, i32 %4
+ %9 = getelementptr half, ptr addrspace(3) %1, i32 %5
+ %10 = getelementptr half, ptr addrspace(3) %1, i32 %6
+ %11 = load <8 x half>, ptr addrspace(3) %7, align 16
+ %12 = load <8 x half>, ptr addrspace(3) %8, align 16
+ %13 = load <8 x half>, ptr addrspace(3) %9, align 16
+ %14 = load <8 x half>, ptr addrspace(3) %10, align 16
+ %15 = fadd <8 x half> %11, %12
+ %16 = fadd <8 x half> %13, %14
+ %17 = fadd <8 x half> %15, %16
+ store <8 x half> %17, ptr addrspace(3) %1, align 16
+ ret void
+}
+
+define amdgpu_kernel void @test3(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test3(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
+; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 4096
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 8192
+; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; CHECK-NEXT: [[TMP13:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
+; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = fadd <8 x half> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: store <8 x half> [[TMP15]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT: ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test3(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT: [[ENTRY:.*:]]
+; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
+; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096
+; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP5]], i32 8192
+; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]]
+; GVN-NEXT: store <8 x half> [[TMP13]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT: ret void
+;
+entry:
+ %2 = select i1 %0, i32 0, i32 288
+ %3 = xor i32 %2, 32
+ %4 = xor i32 %2, 2336
+ %5 = xor i32 %2, 4128
+ %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+ %7 = getelementptr half, ptr addrspace(3) %1, i32 %4
+ %8 = getelementptr half, ptr addrspace(3) %1, i32 %5
+ %9 = load <8 x half>, ptr addrspace(3) %6, align 16
+ %10 = load <8 x half>, ptr addrspace(3) %7, align 16
+ %11 = load <8 x half>, ptr addrspace(3) %8, align 16
+ %12 = fadd <8 x half> %9, %10
+ %13 = fadd <8 x half> %11, %12
+ store <8 x half> %13, ptr addrspace(3) %1, align 16
+ ret void
+}
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll
deleted file mode 100644
index 2cbf2ead2107e..0000000000000
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll
+++ /dev/null
@@ -1,66 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
-; RUN: -S < %s | FileCheck %s
-
-; Test the xor idiom.
-; Xors with disjoint constants 4128, 8224, and 12320 must be folded into GEPs.
-; Xors with non-disjoint constants 2336 and 8480 should not be optimized.
-define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test1(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP14:%.*]] = xor i32 [[TMP2]], 2336
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 8480
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP14]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP20]], i32 8192
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 16384
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP25]], i32 24576
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP16]], align 16
-; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP21]], align 16
-; CHECK-NEXT: [[TMP18:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; CHECK-NEXT: [[TMP19:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP15]], align 16
-; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP22:%.*]] = fadd <8 x half> [[TMP17]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = fadd <8 x half> [[TMP19]], [[TMP11]]
-; CHECK-NEXT: [[TMP24:%.*]] = fadd <8 x half> [[TMP12]], [[TMP22]]
-; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP23]], [[TMP24]]
-; CHECK-NEXT: store <8 x half> [[TMP13]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-entry:
- %2 = select i1 %0, i32 0, i32 288
- %3 = xor i32 %2, 32 ; Base
- %4 = xor i32 %2, 2336 ; Not disjoint
- %5 = xor i32 %2, 4128 ; Disjoint
- %6 = xor i32 %2, 8224 ; Disjoint
- %7 = xor i32 %2, 8480 ; Not disjoint
- %8 = xor i32 %2, 12320 ; Disjoint
- %9 = getelementptr half, ptr addrspace(3) %1, i32 %3
- %10 = getelementptr half, ptr addrspace(3) %1, i32 %4
- %11 = getelementptr half, ptr addrspace(3) %1, i32 %5
- %12 = getelementptr half, ptr addrspace(3) %1, i32 %6
- %13 = getelementptr half, ptr addrspace(3) %1, i32 %7
- %14 = getelementptr half, ptr addrspace(3) %1, i32 %8
- %15 = load <8 x half>, ptr addrspace(3) %9, align 16
- %16 = load <8 x half>, ptr addrspace(3) %10, align 16
- %17 = load <8 x half>, ptr addrspace(3) %11, align 16
- %18 = load <8 x half>, ptr addrspace(3) %12, align 16
- %19 = load <8 x half>, ptr addrspace(3) %13, align 16
- %20 = load <8 x half>, ptr addrspace(3) %14, align 16
- %21 = fadd <8 x half> %15, %16
- %22 = fadd <8 x half> %17, %18
- %23 = fadd <8 x half> %19, %20
- %24 = fadd <8 x half> %21, %22
- %25 = fadd <8 x half> %23, %24
- store <8 x half> %25, ptr addrspace(3) %1, align 16
- ret void
-}
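For illustration, a minimal standalone C++ sketch (not part of the patch)
of the split this revision performs, `Base ^ Const` ->
`(Base ^ NonDisjointBits) ^ DisjointBits`, where the trailing xor by
DisjointBits behaves as an add; the known-zero bits of Base are assumed
here from the test's select of 0 or 288, mirroring what computeKnownBits
would derive:

#include <cassert>
#include <cstdint>

int main() {
  // Assumption: Base = select(.., i32 0, i32 288), so every bit outside
  // 0x120 is known zero.
  const uint32_t KnownZeros = ~(0u | 288u);
  const uint32_t Const = 8224;                    // 0x2020
  const uint32_t Disjoint = Const & KnownZeros;   // 8192, acts as an add
  const uint32_t NonDisjoint = Const & ~Disjoint; // 32, stays in the xor
  for (uint32_t Base : {0u, 288u})
    assert((Base ^ Const) == ((Base ^ NonDisjoint) + Disjoint));
  return 0;
}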
>From 77869e80c91447aed23fceb85001e5b866a64ad8 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at amd.com>
Date: Wed, 30 Jul 2025 17:20:29 -0500
Subject: [PATCH 04/10] Update logic and add more tests
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 116 ++++----
.../AMDGPU/xor-decompose.ll | 260 +++++++++++++++---
2 files changed, 287 insertions(+), 89 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index c6ce7859a1f31..1605aaa3cd1f6 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -321,9 +321,9 @@ class ConstantOffsetExtractor {
bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
bool NonNegative);
- /// Check if Xor instruction should be considered and updated for
- /// optimization.
- bool shouldConsiderAndUpdateXor(BinaryOperator *XorInst);
+ /// Analyze an XOR instruction to extract disjoint constant bits that behave
+ /// like addition for improved address-mode folding.
+ APInt extractDisjointBitsFromXor(BinaryOperator *XorInst);
/// The path from the constant offset to the old GEP index. e.g., if the GEP
/// index is "a * b + (c + 5)". After running function find, UserChain[0] will
@@ -518,14 +518,12 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
bool ZeroExtended,
BinaryOperator *BO,
bool NonNegative) {
- // We only consider ADD, SUB, OR and XOR, because a non-zero constant found in
+ // We only consider ADD, SUB and OR, because a non-zero constant found in
// expressions composed of these operations can be easily hoisted as a
- // constant offset by reassociation. XOR is a special case and can be folded
- // into a GEP if the constant is proven to be disjoint.
+ // constant offset by reassociation.
if (BO->getOpcode() != Instruction::Add &&
BO->getOpcode() != Instruction::Sub &&
- BO->getOpcode() != Instruction::Or &&
- BO->getOpcode() != Instruction::Xor) {
+ BO->getOpcode() != Instruction::Or) {
return false;
}
@@ -536,10 +534,6 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
!cast<PossiblyDisjointInst>(BO)->isDisjoint())
return false;
- // Handle Xor decomposition.
- if (BO->getOpcode() == Instruction::Xor)
- return shouldConsiderAndUpdateXor(BO);
-
// FIXME: We don't currently support constants from the RHS of subs,
// when we are zero-extended, because we need a way to zero-extended
// them before they are negated.
@@ -643,6 +637,9 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
// Trace into subexpressions for more hoisting opportunities.
if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+ // Handle XOR with disjoint bits that can be treated as addition.
+ else if (BO->getOpcode() == Instruction::Xor)
+ ConstantOffset = extractDisjointBitsFromXor(BO);
} else if (isa<TruncInst>(V)) {
ConstantOffset =
find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
@@ -755,11 +752,19 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
Value *NextInChain = removeConstOffset(ChainIndex - 1);
Value *TheOther = BO->getOperand(1 - OpNo);
- // If NextInChain is 0 and not the LHS of a sub, we can simplify the
- // sub-expression to be just TheOther.
if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
- if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
- return TheOther;
+ if (CI->isZero()) {
+ // Special handling for XOR with disjoint bits:
+ // Keep the original XOR instruction with the non-disjoint part of
+ // the constant; the remaining operation is still meaningful.
+ if (BO->getOpcode() == Instruction::Xor)
+ return BO;
+
+ // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+ // sub-expression to be just TheOther.
+ if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
+ }
}
BinaryOperator::BinaryOps NewOp = BO->getOpcode();
@@ -790,58 +795,59 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
return NewBO;
}
-/// \brief Tries to canonicalize a 'xor' with a constant into a form that is
-/// more amenable to address-mode matching.
-///
-/// The transformation rewrites `Base ^ Const` into
-/// `(Base ^ NonDisjointBits) ^ DisjointBits`.
+/// Analyze an XOR instruction to extract disjoint constant bits for address
+/// folding.
///
-/// `DisjointBits` are the bits set in `Const` operand that are known to be zero
-/// in `Base` operand. For these bits, the `xor` operation is equivalent to
-/// `add`, which exposes an offset that can be more easily folded into a memory
-/// access.
+/// This function identifies bits in an XOR constant operand that are disjoint
+/// from the base operand's known set bits. For these disjoint bits, XOR behaves
+/// identically to addition, allowing us to extract them as constant offsets
+/// that can be folded into addressing modes.
///
-/// For example, if we know the low bit of `%ptr` is 0:
-/// `xor %ptr, 3` ; 3 is `0b11`
-/// becomes:
-/// `%tmp = xor %ptr, 2` ; NonDisjointBits is `0b10`
-/// `xor %tmp, 1` ; DisjointBits is `0b01`
+/// Transformation: `Base ^ Const` becomes `(Base ^ NonDisjointBits) +
+/// DisjointBits` where DisjointBits = Const & KnownZeros(Base)
///
-/// The final `xor %tmp, 1` is an addition of 1.
+/// Example with ptr having known-zero low bit:
+/// Original: `xor %ptr, 3` ; 3 = 0b11
+/// Analysis: DisjointBits = 3 & KnownZeros(%ptr) = 0b11 & 0b01 = 0b01
+/// Result: `(xor %ptr, 2) + 1` where 1 can be folded into address mode
///
-/// \returns `true` if the transformation was applied, `false` otherwise.
-bool ConstantOffsetExtractor::shouldConsiderAndUpdateXor(
+/// \param XorInst The XOR binary operator to analyze
+/// \return APInt containing the disjoint bits that can be extracted as offset,
+/// or zero if no disjoint bits exist
+APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
BinaryOperator *XorInst) {
+ assert(XorInst && XorInst->getOpcode() == Instruction::Xor &&
+ "Expected XOR instruction");
+
+ const unsigned BitWidth = XorInst->getType()->getScalarSizeInBits();
Value *BaseOperand;
- ConstantInt *XorConst;
- if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConst))))
- return false;
+ ConstantInt *XorConstant;
+
+ // Match pattern: xor BaseOperand, Constant.
+ if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
+ return APInt::getZero(BitWidth);
+ // Compute known bits for the base operand.
const SimplifyQuery SQ(DL);
- const KnownBits BaseKnown = computeKnownBits(BaseOperand, SQ);
- const APInt &ConstValue = XorConst->getValue();
+ const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ);
+ const APInt &ConstantValue = XorConstant->getValue();
- // Check if any bits of the constant can be treated as disjoint
- // (addition-like).
- const APInt DisjointBits = ConstValue & BaseKnown.Zero;
- if (DisjointBits.isZero())
- return false;
+ // Identify disjoint bits: constant bits that are known zero in base.
+ const APInt DisjointBits = ConstantValue & BaseKnownBits.Zero;
- // Split the xor into disjoint and non-disjoint parts.
- const APInt NonDisjointBits = ConstValue & ~DisjointBits;
+ // Early exit if no disjoint bits found.
+ if (DisjointBits.isZero())
+ return APInt::getZero(BitWidth);
- IRBuilder<> Builder(XorInst);
- Type *Ty = XorConst->getType();
+ // Compute the remaining non-disjoint bits that stay in the XOR.
+ const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
- // Transform: (base ^ constant) -> ((base ^ non_disjoint) ^ disjoint).
- if (!NonDisjointBits.isZero()) {
- Value *NewBase =
- Builder.CreateXor(BaseOperand, ConstantInt::get(Ty, NonDisjointBits));
- XorInst->setOperand(0, NewBase);
- }
+ // Add the non-disjoint constant to the user chain for later transformation
+ // This will replace the original constant in the XOR with the reduced
+ // constant.
+ UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits));
- XorInst->setOperand(1, ConstantInt::get(Ty, DisjointBits));
- return true;
+ return DisjointBits;
}
/// A helper function to check if reassociating through an entry in the user
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
index f7cd8f3139ae4..df3a9180b1617 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
@@ -6,25 +6,27 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep,gvn \
; RUN: -S < %s | FileCheck --check-prefix=GVN %s
+; Check that disjoint constants are properly extracted and folded into GEP
+; addressing modes, and that GVN eliminates the redundant computations.
define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test1(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 8192
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP10]], i32 16384
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192
+; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 16384
+; CHECK-NEXT: [[TMP11:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 24576
-; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP9]], align 16
-; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP11]], align 16
+; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]]
@@ -72,6 +74,8 @@ entry:
ret void
}
+; Check that disjoint constants are properly extracted and folded into GEP
+; addressing modes, and that GVN eliminates the redundant computations.
define amdgpu_kernel void @test2(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test2(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
@@ -79,17 +83,17 @@ define amdgpu_kernel void @test2(i1 %0, ptr addrspace(3) %1) {
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 24576
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 16384
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP5]], i32 24576
+; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 16384
+; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 8192
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP9]], align 16
; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP12]], align 16
; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
@@ -138,21 +142,22 @@ entry:
ret void
}
+; Verify that xor instructions whose constants have different non-disjoint parts are still optimized.
define amdgpu_kernel void @test3(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test3(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 4096
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096
+; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 8192
-; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; CHECK-NEXT: [[TMP13:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = fadd <8 x half> [[TMP13]], [[TMP14]]
@@ -164,12 +169,12 @@ define amdgpu_kernel void @test3(i1 %0, ptr addrspace(3) %1) {
; GVN-NEXT: [[ENTRY:.*:]]
; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
+; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096
-; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP5]], i32 8192
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
+; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
@@ -193,3 +198,190 @@ entry:
store <8 x half> %13, ptr addrspace(3) %1, align 16
ret void
}
+
+; Verify that no decomposition occurs when the xor constant has no disjoint bits
+define amdgpu_kernel void @test4(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test4(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT: ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test4(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT: [[ENTRY:.*:]]
+; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
+; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; GVN-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; GVN-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; GVN-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT: ret void
+;
+entry:
+ %2 = select i1 %0, i32 0, i32 288
+ %3 = xor i32 %2, 32
+ %4 = xor i32 %2, 288
+ %5 = getelementptr half, ptr addrspace(3) %1, i32 %3
+ %6 = getelementptr half, ptr addrspace(3) %1, i32 %4
+ %7 = load <8 x half>, ptr addrspace(3) %5, align 16
+ %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+ %9 = fadd <8 x half> %7, %8
+ store <8 x half> %9, ptr addrspace(3) %1, align 16
+ ret void
+}
+
+
+; Verify that XOR-BinOp-GEP usage chains are properly optimized
+define amdgpu_kernel void @test5(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test5(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 256
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT: ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test5(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT: [[ENTRY:.*:]]
+; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT: [[TMP5:%.*]] = add i32 [[TMP3]], 256
+; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192
+; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT: ret void
+;
+entry:
+ %2 = select i1 %0, i32 0, i32 288
+ %3 = xor i32 %2, 32
+ %4 = xor i32 %2, 4128
+ %5 = add i32 %4, 256
+ %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+ %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
+ %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+ %9 = load <8 x half>, ptr addrspace(3) %7, align 16
+ %10 = fadd <8 x half> %8, %9
+ store <8 x half> %10, ptr addrspace(3) %1, align 16
+ ret void
+}
+
+
+; Verify that BinOp-XOR-GEP usage chains are properly optimized
+; This represents the common pattern found in real target workloads
+define amdgpu_kernel void @test6(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test6(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], 32
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT: ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test6(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT: [[ENTRY:.*:]]
+; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
+; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], 32
+; GVN-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
+; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; GVN-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; GVN-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT: ret void
+;
+entry:
+ %2 = select i1 %0, i32 0, i32 288
+ %3 = xor i32 %2, 32
+ %4 = add i32 %2, 256
+ %5 = xor i32 %4, 4128
+ %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+ %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
+ %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+ %9 = load <8 x half>, ptr addrspace(3) %7, align 16
+ %10 = fadd <8 x half> %8, %9
+ store <8 x half> %10, ptr addrspace(3) %1, align 16
+ ret void
+}
+
+
+; Ensure disjoint constants exceeding addressing mode limits (e.g., 32768) are
+; not extracted
+define amdgpu_kernel void @test7(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test7(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT: ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test7(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT: [[ENTRY:.*:]]
+; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
+; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; GVN-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; GVN-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; GVN-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT: ret void
+;
+entry:
+ %2 = select i1 %0, i32 0, i32 288
+ %3 = xor i32 %2, 32
+ %4 = xor i32 %2, 32800
+ %5 = getelementptr half, ptr addrspace(3) %1, i32 %3
+ %6 = getelementptr half, ptr addrspace(3) %1, i32 %4
+ %7 = load <8 x half>, ptr addrspace(3) %5, align 16
+ %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+ %9 = fadd <8 x half> %7, %8
+ store <8 x half> %9, ptr addrspace(3) %1, align 16
+ ret void
+}
+
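A standalone sanity check of the identity these tests exercise, using @test2's
own constants (the 24576-byte GEP offset is 12288 halves). Everything below is
illustrative scaffolding, not part of the patch:

  #include <cassert>
  #include <cstdint>
  #include <initializer_list>

  int main() {
    // C = 32 overlaps the select result's bits, so it must stay in the xor;
    // B = 12288 is disjoint from both A and A ^ C, so xoring B equals adding
    // B, which is exactly the piece the pass peels off into the GEP.
    for (uint32_t A : {0u, 288u}) {     // the two possible select results
      const uint32_t C = 32, B = 12288; // original xor constant: B | C == 12320
      assert((A ^ (B | C)) == ((A ^ C) + B));
    }
    return 0;
  }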
>From fa0af2612cba7aae1835f80f888e540b8d6f4f47 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at amd.com>
Date: Thu, 31 Jul 2025 13:10:12 -0500
Subject: [PATCH 05/10] Fix recursion logic
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 8 +++++
.../AMDGPU/xor-decompose.ll | 36 +++++++++----------
2 files changed, 24 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 1605aaa3cd1f6..8533b645ba774 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -827,6 +827,14 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
return APInt::getZero(BitWidth);
+ // Try to extract constant offset from the base operand recursively.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BaseOperand)) {
+ APInt ConstantOffset = find(BO, /*SignExtended=*/false,
+ /*ZeroExtended=*/false, /*NonNegative=*/false);
+ if (ConstantOffset.isZero())
+ return ConstantOffset;
+ }
+
// Compute known bits for the base operand.
const SimplifyQuery SQ(DL);
const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ);
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
index df3a9180b1617..d9f73c1d30cce 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
@@ -290,24 +290,21 @@ entry:
ret void
}
-
; Verify that BinOp-XOR-GEP usage chains are properly optimized
-; This represents the common pattern found in real target workloads
define amdgpu_kernel void @test6(i1 %0, ptr addrspace(3) %1) {
; CHECK-LABEL: define amdgpu_kernel void @test6(
; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], 32
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 4128
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
; CHECK-NEXT: ret void
;
; GVN-LABEL: define amdgpu_kernel void @test6(
@@ -315,15 +312,14 @@ define amdgpu_kernel void @test6(i1 %0, ptr addrspace(3) %1) {
; GVN-NEXT: [[ENTRY:.*:]]
; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], 32
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
-; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; GVN-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; GVN-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 4128
+; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512
+; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
; GVN-NEXT: ret void
;
entry:
>From 0f88c77e18531cf7860c2313e1b5f1df83be6bff Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at amd.com>
Date: Thu, 31 Jul 2025 13:22:17 -0500
Subject: [PATCH 06/10] Fix the typo
---
llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 8533b645ba774..6e715a0160419 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -831,7 +831,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BaseOperand)) {
APInt ConstantOffset = find(BO, /*SignExtended=*/false,
/*ZeroExtended=*/false, /*NonNegative=*/false);
- if (ConstantOffset.isZero())
+ if (!ConstantOffset.isZero())
return ConstantOffset;
}
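The gate this extraction relies on boils down to a single bit test over known
bits. A minimal sketch of the predicate, with a stand-in struct instead of
llvm::KnownBits (all names here are illustrative, not the patch's):

  #include <cstdint>

  // Stand-in for the facts computeKnownBits() provides about the base.
  struct KnownBitsLite {
    uint64_t Zero; // bits proven to be 0 in the base operand
  };

  // Bits of the xor constant that land only on known-zero bits of the base
  // behave as pure addition, so they are safe to peel off as a GEP offset.
  uint64_t disjointBits(KnownBitsLite Base, uint64_t XorConst) {
    return XorConst & Base.Zero;
  }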
>From 378e5ffa943b90384014f1e25740676a9cdd5a57 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at amd.com>
Date: Fri, 1 Aug 2025 09:15:55 -0500
Subject: [PATCH 07/10] Update lit test
---
.../AMDGPU/xor-decompose.ll | 46 +++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
index d9f73c1d30cce..36900ba7e16bd 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
@@ -336,6 +336,52 @@ entry:
ret void
}
+; Verify that BinOp-XOR-GEP usage chains with a non-disjoint xor work as
+; intended.
+define amdgpu_kernel void @test6a(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test6a(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT: ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test6a(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT: [[ENTRY:.*:]]
+; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
+; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512
+; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT: ret void
+;
+entry:
+ %2 = select i1 %0, i32 0, i32 288
+ %3 = xor i32 %2, 32
+ %4 = add i32 %2, 256
+ %5 = xor i32 %4, 288
+ %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+ %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
+ %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+ %9 = load <8 x half>, ptr addrspace(3) %7, align 16
+ %10 = fadd <8 x half> %8, %9
+ store <8 x half> %10, ptr addrspace(3) %1, align 16
+ ret void
+}
; Ensure disjoint constants exceeding addressing mode limits (e.g., 32768) are
; not extracted
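On the 32768 cutoff in test7: a gep over half scales the index by 2, and
AMDGPU's ds instructions carry only a 16-bit unsigned byte offset, so
extracting that bit would not fold into the addressing mode anyway. A quick
check of the arithmetic (the 16-bit limit is an assumption about the target
here, not something the patch states):

  #include <cassert>
  #include <cstdint>

  int main() {
    const int64_t IndexInHalves = 32768;   // disjoint bit of the xor constant
    const int64_t ByteOffset = IndexInHalves * 2;
    const int64_t MaxDsImm = 65535;        // assumed 16-bit ds offset limit
    assert(ByteOffset > MaxDsImm);         // 65536 does not fit, so skip it
    return 0;
  }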
>From 81ab5297d1d140e3dc2a5c740c1a150121871603 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at amd.com>
Date: Mon, 4 Aug 2025 16:14:53 -0500
Subject: [PATCH 08/10] Fix the correctness bug in recursion
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 23 +++++++++++--------
.../AMDGPU/xor-decompose.ll | 20 ++++++++--------
2 files changed, 24 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 6e715a0160419..25c1020da15f3 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1,4 +1,4 @@
-//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
+//===- separateConstOffsetFromGEP.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -827,14 +827,6 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
return APInt::getZero(BitWidth);
- // Try to extract constant offset from the base operand recursively.
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BaseOperand)) {
- APInt ConstantOffset = find(BO, /*SignExtended=*/false,
- /*ZeroExtended=*/false, /*NonNegative=*/false);
- if (!ConstantOffset.isZero())
- return ConstantOffset;
- }
-
// Compute known bits for the base operand.
const SimplifyQuery SQ(DL);
const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ);
@@ -847,6 +839,19 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
if (DisjointBits.isZero())
return APInt::getZero(BitWidth);
+ // Recursively extract constant offset from the base operand.
+ if (auto *BO = dyn_cast<BinaryOperator>(BaseOperand)) {
+ APInt ConstantOffset = find(BO, /*SignExtended=*/false,
+ /*ZeroExtended=*/false, /*NonNegative=*/false);
+
+ // (A binop B xor C) is not always equivalent to (A xor C binop B).
+ // These cases might already be optimized out by instruction combine.
+ if (!(ConstantOffset & DisjointBits).isZero())
+ return APInt::getZero(BitWidth);
+
+ return ConstantOffset;
+ }
+
// Compute the remaining non-disjoint bits that stay in the XOR.
const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
index 36900ba7e16bd..514ce1718b0e0 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
@@ -344,11 +344,11 @@ define amdgpu_kernel void @test6a(i1 %0, ptr addrspace(3) %1) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512
-; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
+; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP4]], 288
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; CHECK-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
; CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
@@ -359,11 +359,11 @@ define amdgpu_kernel void @test6a(i1 %0, ptr addrspace(3) %1) {
; GVN-NEXT: [[ENTRY:.*:]]
; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
+; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP4]], 288
+; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
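The bug this patch fixes is visible with @test6a's own numbers when %0 is
false: reassociating the add past a non-disjoint xor changes the computed
index. A freestanding check (variable names are just labels for the test's
values):

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint32_t A = 288, B = 256, C = 288; // %2, the add amount, xor const
    assert(((A + B) ^ C) == 768); // what @test6a actually computes
    assert(((A ^ C) + B) == 256); // what the invalid reassociation produced
    return 0;
  }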
>From 18ddd890a598e1732b9b0d4dba23a72ab88db2f4 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at amd.com>
Date: Mon, 4 Aug 2025 16:17:22 -0500
Subject: [PATCH 09/10] Fix a typo
---
llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 25c1020da15f3..c798013f0055c 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1,4 +1,4 @@
-//===- separateConstOffsetFromGEP.cpp -------------------------------------===//
+//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
>From 9fe29b4c144395dd7006ecaf989be74e617bf442 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at amd.com>
Date: Tue, 5 Aug 2025 09:45:39 -0500
Subject: [PATCH 10/10] Update code if there is no extractable offset in
recursion.
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 40 +++++++++++--------
1 file changed, 23 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index c798013f0055c..c2608c7ee9c72 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -839,28 +839,34 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
if (DisjointBits.isZero())
return APInt::getZero(BitWidth);
- // Recursively extract constant offset from the base operand.
- if (auto *BO = dyn_cast<BinaryOperator>(BaseOperand)) {
- APInt ConstantOffset = find(BO, /*SignExtended=*/false,
- /*ZeroExtended=*/false, /*NonNegative=*/false);
+ // Compute the remaining non-disjoint bits that stay in the XOR.
+ const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
- // (A binop B xor C) is not always equivalent to (A xor C binop B).
- // These cases might already be optimized out by instruction combine.
- if (!(ConstantOffset & DisjointBits).isZero())
- return APInt::getZero(BitWidth);
+ // Add non-disjoint bits to user chain and return.
+ auto addToUserChainAndReturn = [&]() -> APInt {
+ UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits));
+ return DisjointBits;
+ };
- return ConstantOffset;
- }
+ // Handle recursive extraction for binary operators.
+ auto *BO = dyn_cast<BinaryOperator>(BaseOperand);
+ if (!BO)
+ return addToUserChainAndReturn();
- // Compute the remaining non-disjoint bits that stay in the XOR.
- const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
+ APInt ConstantOffset = find(BO, /*SignExtended=*/false,
+ /*ZeroExtended=*/false, /*NonNegative=*/false);
- // Add the non-disjoint constant to the user chain for later transformation
- // This will replace the original constant in the XOR with the reduced
- // constant.
- UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits));
+ // Add to the chain and return if no further constant extraction is possible.
+ if (ConstantOffset.isZero())
+ return addToUserChainAndReturn();
- return DisjointBits;
+ // Check for conflicts between the extracted offset and the disjoint bits:
+ // (A binop B xor C) is not always equivalent to (A xor C binop B).
+ // These cases might already be optimized out by instruction combine.
+ if (!(ConstantOffset & DisjointBits).isZero())
+ return APInt::getZero(BitWidth);
+
+ return ConstantOffset;
}
/// A helper function to check if reassociating through an entry in the user
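After this revision the decision order in extractDisjointBitsFromXor is:
compute the disjoint bits, bail out if there are none, then recurse into the
base, and push the residual constant onto the user chain only when recursion
yields nothing. A compressed, freestanding model of that flow (plain integers
stand in for APInt, and the UserChain side effect is elided):

  #include <cstdint>
  #include <optional>

  // Recursed models what find() returns for the base operand's subtree;
  // nullopt means the base is not a binary operator at all.
  std::optional<uint64_t> extractDisjoint(uint64_t XorConst, uint64_t KnownZero,
                                          std::optional<uint64_t> Recursed) {
    const uint64_t Disjoint = XorConst & KnownZero;
    if (Disjoint == 0)
      return std::nullopt;            // nothing extractable at all
    if (!Recursed || *Recursed == 0)
      return Disjoint;                // xor keeps XorConst & ~Disjoint
    if (*Recursed & Disjoint)
      return std::nullopt;            // reassociation would change the value
    return *Recursed;                 // defer to the deeper constant
  }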