[llvm-branch-commits] [llvm] Revert "[SeparateConstOffsetFromGEP] Decompose constant xor operand if possible (#150438)" (PR #179507)
Cullen Rhodes via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Feb 4 01:35:50 PST 2026
https://github.com/c-rhodes updated https://github.com/llvm/llvm-project/pull/179507
From a1fd09748aa437cf581e618a1ed582e3c45c6e19 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma at qti.qualcomm.com>
Date: Tue, 3 Feb 2026 09:35:04 -0800
Subject: [PATCH] Revert "[SeparateConstOffsetFromGEP] Decompose constant xor operand if possible (#150438)"
Cherry-pick of #179339 (a2c7c6032f27c4f8d6f7327a7ca15705d3081c3e).
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 85 +---
.../AMDGPU/xor-decompose.ll | 435 ------------------
2 files changed, 4 insertions(+), 516 deletions(-)
delete mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index dc47b243625b8..9934caef22a8f 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -295,10 +295,6 @@ class ConstantOffsetExtractor {
bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
bool NonNegative);
- /// Analyze XOR instruction to extract disjoint constant bits that behave
- /// like addition operations for improved address mode folding.
- APInt extractDisjointBitsFromXor(BinaryOperator *XorInst);
-
/// The path from the constant offset to the old GEP index. e.g., if the GEP
/// index is "a * b + (c + 5)". After running function find, UserChain[0] will
/// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
@@ -601,9 +597,6 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
// Trace into subexpressions for more hoisting opportunities.
if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
- // Handle XOR with disjoint bits that can be treated as addition.
- else if (BO->getOpcode() == Instruction::Xor)
- ConstantOffset = extractDisjointBitsFromXor(BO);
} else if (isa<TruncInst>(V)) {
ConstantOffset =
find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
@@ -723,20 +716,11 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
Value *NextInChain = removeConstOffset(ChainIndex - 1);
Value *TheOther = BO->getOperand(1 - OpNo);
+ // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+ // sub-expression to be just TheOther.
if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
- if (CI->isZero()) {
- // Custom XOR handling for disjoint bits - preserves original XOR
- // with non-disjoint constant bits.
- // TODO: The design should be updated to support partial constant
- // extraction.
- if (BO->getOpcode() == Instruction::Xor)
- return BO;
-
- // If NextInChain is 0 and not the LHS of a sub, we can simplify the
- // sub-expression to be just TheOther.
- if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
- return TheOther;
- }
+ if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
}
BinaryOperator::BinaryOps NewOp = BO->getOpcode();
@@ -767,67 +751,6 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
return NewBO;
}
-/// Analyze XOR instruction to extract disjoint constant bits for address
-/// folding
-///
-/// This function identifies bits in an XOR constant operand that are disjoint
-/// from the base operand's known set bits. For these disjoint bits, XOR behaves
-/// identically to addition, allowing us to extract them as constant offsets
-/// that can be folded into addressing modes.
-///
-/// Transformation: `Base ^ Const` becomes `(Base ^ NonDisjointBits) +
-/// DisjointBits` where DisjointBits = Const & KnownZeros(Base)
-///
-/// Example with ptr having known-zero low bit:
-/// Original: `xor %ptr, 3` ; 3 = 0b11
-/// Analysis: DisjointBits = 3 & KnownZeros(%ptr) = 0b11 & 0b01 = 0b01
-/// Result: `(xor %ptr, 2) + 1` where 1 can be folded into address mode
-///
-/// \param XorInst The XOR binary operator to analyze
-/// \return APInt containing the disjoint bits that can be extracted as offset,
-/// or zero if no disjoint bits exist
-APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
- BinaryOperator *XorInst) {
- assert(XorInst && XorInst->getOpcode() == Instruction::Xor &&
- "Expected XOR instruction");
-
- const unsigned BitWidth = XorInst->getType()->getScalarSizeInBits();
- Value *BaseOperand;
- ConstantInt *XorConstant;
-
- // Match pattern: xor BaseOperand, Constant.
- if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
- return APInt::getZero(BitWidth);
-
- // Compute known bits for the base operand.
- const SimplifyQuery SQ(DL);
- const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ);
- const APInt &ConstantValue = XorConstant->getValue();
-
- // Identify disjoint bits: constant bits that are known zero in base.
- const APInt DisjointBits = ConstantValue & BaseKnownBits.Zero;
-
- // Early exit if no disjoint bits found.
- if (DisjointBits.isZero())
- return APInt::getZero(BitWidth);
-
- // Compute the remaining non-disjoint bits that stay in the XOR.
- const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
-
- // FIXME: Enhance XOR constant extraction to handle nested binary operations.
- // Currently we only extract disjoint bits from the immediate XOR constant,
- // but we could recursively process cases like:
- // xor (add %base, C1), C2 -> add %base, (C1 ^ disjoint_bits(C2))
- // This requires careful analysis to ensure the transformation preserves
- // semantics, particularly around sign extension and overflow behavior.
-
- // Add the non-disjoint constant to the user chain for later transformation
- // This will replace the original constant in the XOR with the new
- // constant.
- UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits));
- return DisjointBits;
-}
-
/// A helper function to check if reassociating through an entry in the user
/// chain would invalidate the GEP's nuw flag.
static bool allowsPreservingNUW(const User *U) {
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
deleted file mode 100644
index 056f33e5ee367..0000000000000
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
+++ /dev/null
@@ -1,435 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; Test the xor with constant operand is decomposed in to gep.
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
-; RUN: -S < %s | FileCheck %s
-; Test the gvn pass eliminates the redundant xor instructions from decomposition.
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep,gvn \
-; RUN: -S < %s | FileCheck --check-prefix=GVN %s
-
-; Check that disjoint constants are properly extracted and folded into GEP
-; addressing modes and GVN to eliminate redundant computations
-define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test1(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192
-; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 16384
-; CHECK-NEXT: [[TMP11:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 24576
-; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
-; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
-; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test1(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
-; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]]
-; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]]
-; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
- %2 = select i1 %0, i32 0, i32 288
- %3 = xor i32 %2, 32
- %4 = xor i32 %2, 4128
- %5 = xor i32 %2, 8224
- %6 = xor i32 %2, 12320
- %7 = getelementptr half, ptr addrspace(3) %1, i32 %3
- %8 = getelementptr half, ptr addrspace(3) %1, i32 %4
- %9 = getelementptr half, ptr addrspace(3) %1, i32 %5
- %10 = getelementptr half, ptr addrspace(3) %1, i32 %6
- %11 = load <8 x half>, ptr addrspace(3) %7, align 16
- %12 = load <8 x half>, ptr addrspace(3) %8, align 16
- %13 = load <8 x half>, ptr addrspace(3) %9, align 16
- %14 = load <8 x half>, ptr addrspace(3) %10, align 16
- %15 = fadd <8 x half> %11, %12
- %16 = fadd <8 x half> %13, %14
- %17 = fadd <8 x half> %15, %16
- store <8 x half> %17, ptr addrspace(3) %1, align 16
- ret void
-}
-
-; Check that disjoint constants are properly extracted and folded into GEP
-; addressing modes and GVN to eliminate redundant computations
-define amdgpu_kernel void @test2(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test2(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP5]], i32 24576
-; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 16384
-; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 8192
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP9]], align 16
-; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP12]], align 16
-; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16
-; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test2(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
-; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]]
-; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]]
-; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
- %2 = select i1 %0, i32 0, i32 288
- %3 = xor i32 %2, 12320
- %4 = xor i32 %2, 8224
- %5 = xor i32 %2, 4128
- %6 = xor i32 %2, 32
- %7 = getelementptr half, ptr addrspace(3) %1, i32 %3
- %8 = getelementptr half, ptr addrspace(3) %1, i32 %4
- %9 = getelementptr half, ptr addrspace(3) %1, i32 %5
- %10 = getelementptr half, ptr addrspace(3) %1, i32 %6
- %11 = load <8 x half>, ptr addrspace(3) %7, align 16
- %12 = load <8 x half>, ptr addrspace(3) %8, align 16
- %13 = load <8 x half>, ptr addrspace(3) %9, align 16
- %14 = load <8 x half>, ptr addrspace(3) %10, align 16
- %15 = fadd <8 x half> %11, %12
- %16 = fadd <8 x half> %13, %14
- %17 = fadd <8 x half> %15, %16
- store <8 x half> %17, ptr addrspace(3) %1, align 16
- ret void
-}
-
-; Verify that xor instructions with different non-disjoint constants are optimized
-define amdgpu_kernel void @test3(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test3(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096
-; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 8192
-; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; CHECK-NEXT: [[TMP13:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16
-; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = fadd <8 x half> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: store <8 x half> [[TMP15]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test3(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096
-; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]]
-; GVN-NEXT: store <8 x half> [[TMP13]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
- %2 = select i1 %0, i32 0, i32 288
- %3 = xor i32 %2, 32
- %4 = xor i32 %2, 2336
- %5 = xor i32 %2, 4128
- %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
- %7 = getelementptr half, ptr addrspace(3) %1, i32 %4
- %8 = getelementptr half, ptr addrspace(3) %1, i32 %5
- %9 = load <8 x half>, ptr addrspace(3) %6, align 16
- %10 = load <8 x half>, ptr addrspace(3) %7, align 16
- %11 = load <8 x half>, ptr addrspace(3) %8, align 16
- %12 = fadd <8 x half> %9, %10
- %13 = fadd <8 x half> %11, %12
- store <8 x half> %13, ptr addrspace(3) %1, align 16
- ret void
-}
-
-; Verify that no optimization occurs when disjoint constants are absent
-define amdgpu_kernel void @test4(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test4(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test4(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; GVN-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
-; GVN-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
- %2 = select i1 %0, i32 0, i32 288
- %3 = xor i32 %2, 32
- %4 = xor i32 %2, 288
- %5 = getelementptr half, ptr addrspace(3) %1, i32 %3
- %6 = getelementptr half, ptr addrspace(3) %1, i32 %4
- %7 = load <8 x half>, ptr addrspace(3) %5, align 16
- %8 = load <8 x half>, ptr addrspace(3) %6, align 16
- %9 = fadd <8 x half> %7, %8
- store <8 x half> %9, ptr addrspace(3) %1, align 16
- ret void
-}
-
-
-; Verify that XOR-BinOp-GEP usage chains are properly optimized
-define amdgpu_kernel void @test5(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test5(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 256
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test5(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP5:%.*]] = add i32 [[TMP3]], 256
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
-; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
- %2 = select i1 %0, i32 0, i32 288
- %3 = xor i32 %2, 32
- %4 = xor i32 %2, 4128
- %5 = add i32 %4, 256
- %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
- %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
- %8 = load <8 x half>, ptr addrspace(3) %6, align 16
- %9 = load <8 x half>, ptr addrspace(3) %7, align 16
- %10 = fadd <8 x half> %8, %9
- store <8 x half> %10, ptr addrspace(3) %1, align 16
- ret void
-}
-
-; Verify that BinOp-XOR-GEP usage chains are properly optimized.
-; In the below test, make sure we stop processing the chain at xor
-; and not fold the constant from add instruction in to gep. The
-; constant from add can be folded and the future work will cover
-; these cases.
-define amdgpu_kernel void @test6(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test6(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], 32
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test6(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], 32
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
-; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; GVN-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; GVN-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
- %2 = select i1 %0, i32 0, i32 288
- %3 = xor i32 %2, 32
- %4 = add i32 %2, 256
- %5 = xor i32 %4, 4128
- %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
- %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
- %8 = load <8 x half>, ptr addrspace(3) %6, align 16
- %9 = load <8 x half>, ptr addrspace(3) %7, align 16
- %10 = fadd <8 x half> %8, %9
- store <8 x half> %10, ptr addrspace(3) %1, align 16
- ret void
-}
-
-; Verify that BinOp-XOR-GEP usage chains with non disjoint xor works as
-; intended.
-define amdgpu_kernel void @test6a(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test6a(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP4]], 288
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; CHECK-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test6a(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP4]], 288
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
-; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
-; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
- %2 = select i1 %0, i32 0, i32 288
- %3 = xor i32 %2, 32
- %4 = add i32 %2, 256
- %5 = xor i32 %4, 288
- %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
- %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
- %8 = load <8 x half>, ptr addrspace(3) %6, align 16
- %9 = load <8 x half>, ptr addrspace(3) %7, align 16
- %10 = fadd <8 x half> %8, %9
- store <8 x half> %10, ptr addrspace(3) %1, align 16
- ret void
-}
-
-; Ensure disjoint constants exceeding addressing mode limits (e.g., 32768) are
-; not extracted
-define amdgpu_kernel void @test7(i1 %0, ptr addrspace(3) %1) {
-; CHECK-LABEL: define amdgpu_kernel void @test7(
-; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
-; CHECK-NEXT: ret void
-;
-; GVN-LABEL: define amdgpu_kernel void @test7(
-; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
-; GVN-NEXT: [[ENTRY:.*:]]
-; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
-; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
-; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
-; GVN-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
-; GVN-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
-; GVN-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
-; GVN-NEXT: ret void
-;
-entry:
- %2 = select i1 %0, i32 0, i32 288
- %3 = xor i32 %2, 32
- %4 = xor i32 %2, 32800
- %5 = getelementptr half, ptr addrspace(3) %1, i32 %3
- %6 = getelementptr half, ptr addrspace(3) %1, i32 %4
- %7 = load <8 x half>, ptr addrspace(3) %5, align 16
- %8 = load <8 x half>, ptr addrspace(3) %6, align 16
- %9 = fadd <8 x half> %7, %8
- store <8 x half> %9, ptr addrspace(3) %1, align 16
- ret void
-}
-
More information about the llvm-branch-commits mailing list