[clang] [clang-tools-extra] [llvm] [SeparateConstOffsetFromGEP] Handle `or disjoint` flags (PR #76997)
Krzysztof Drewniak via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 25 14:53:32 PST 2024
https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/76997
From 5cc46862df42e7d01a2d45ccc18f221744af0b93 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Thu, 4 Jan 2024 20:20:54 +0000
Subject: [PATCH 1/2] [SeparateConstOffsetFromGEP] Handle `or disjoint` flags
This commit extends separate-const-offset-from-gep to look at the
newly-added `disjoint` flag on `or` instructions so as to preserve
additional opportunities for optimization.
As with other `or disjoint`-handling commits, this does not remove the
existing check for the `or`'s operands having no bits in common, because
`disjoint` is currently not inferred.
The tests were pre-committed in #76972.
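For illustration only (not part of the patch), here is a minimal sketch of
the kind of IR this lets the pass split, mirroring the testDisjointOrSplits
test updated below:

declare i64 @foo()

define void @example(ptr %p) {
  %var = tail call i64 @foo()
  ; `disjoint` guarantees this `or` is equivalent to an `add`, so the
  ; pass may now extract the constant 10 from the GEP index.
  %off = or disjoint i64 %var, 10
  %q = getelementptr i8, ptr %p, i64 %off
  store i8 0, ptr %q
  ret void
}

After the pass runs, the constant ends up in a separate add of the pointer
value, as the updated CHECK lines in the test diff show.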
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 13 ++++++++-----
.../split-gep-or-as-add.ll | 8 +++++---
2 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 225dd454068c84..9bb42f7be8d70a 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -174,6 +174,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -519,12 +520,14 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
}
Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
- // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
- // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
- // FIXME: this does not appear to be covered by any tests
- // (with x86/aarch64 backends at least)
+ // Do not trace into "or" unless it is equivalent to "add".
+ // This is the case if the "or"'s disjoint flag is set, or (because we
+ // currently don't infer the disjoint flags) if its left and right operands
+ // have nothing in common.
if (BO->getOpcode() == Instruction::Or &&
- !haveNoCommonBitsSet(LHS, RHS, SimplifyQuery(DL, DT, /*AC*/ nullptr, BO)))
+ !(cast<PossiblyDisjointInst>(BO)->isDisjoint() ||
+ haveNoCommonBitsSet(LHS, RHS,
+ SimplifyQuery(DL, DT, /*AC*/ nullptr, BO))))
return false;
// FIXME: We don't currently support constants from the RHS of subs,
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
index 45154f5a68f92c..5041fed12f5963 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
@@ -46,9 +46,11 @@ define void @testDisjointOrSplits(ptr %p) {
; CHECK-LABEL: define void @testDisjointOrSplits(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[VAR:%.*]] = tail call i64 @foo()
-; CHECK-NEXT: [[OFF:%.*]] = or disjoint i64 [[VAR]], 10
-; CHECK-NEXT: [[Q:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFF]]
-; CHECK-NEXT: store i8 0, ptr [[Q]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[VAR]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 10
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: store i8 0, ptr [[TMP4]], align 1
; CHECK-NEXT: ret void
;
%var = tail call i64 @foo()
From af2a388a837adaf8829a71ec863716051afb8324 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Thu, 25 Jan 2024 22:52:24 +0000
Subject: [PATCH 2/2] Remove old check, update tests.
All tests in SeparateConstOffsetFromGEP that were relying on the
haveNoCommonBitsSet() check have been updated to annotate the relevant
`or`s with `disjoint`.
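As a sketch of the behavioral change (mirroring the renamed
testNoBitsInCommonOrDoesntSplit test below), a plain `or` whose operands
provably share no bits is no longer split, since the pass now requires the
explicit `disjoint` flag:

declare i64 @foo()

define void @not_split(ptr %p) {
  %var = tail call i64 @foo()
  %var.high = and i64 %var, -16   ; low four bits known to be zero
  ; 10 shares no bits with %var.high, but without `disjoint` the pass
  ; now conservatively leaves the GEP untouched.
  %off = or i64 %var.high, 10
  %q = getelementptr i8, ptr %p, i64 %off
  store i8 0, ptr %q
  ret void
}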
---
.../Scalar/SeparateConstOffsetFromGEP.cpp | 33 +++++++------------
.../AMDGPU/GlobalISel/merge-buffer-stores.ll | 24 +++++++-------
.../AMDGPU/constant-address-space-32bit.ll | 4 +--
.../CodeGen/AMDGPU/extract-subvector-16bit.ll | 16 ++++-----
...ne-sink-temporal-divergence-swdev407790.ll | 2 +-
llvm/test/CodeGen/NVPTX/vector-loads.ll | 8 ++---
llvm/test/CodeGen/PowerPC/mma-intrinsics.ll | 2 +-
llvm/test/CodeGen/PowerPC/sched-addi.ll | 2 +-
...-gep-and-gvn-addrspace-addressing-modes.ll | 2 +-
.../NVPTX/split-gep.ll | 4 +--
.../split-gep-or-as-add.ll | 15 +++++----
11 files changed, 52 insertions(+), 60 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 65a96727c3b541..4481375054ecf1 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -236,18 +236,16 @@ class ConstantOffsetExtractor {
/// \p UserChainTail Outputs the tail of UserChain so that we can
/// garbage-collect unused instructions in UserChain.
static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
- User *&UserChainTail, const DominatorTree *DT);
+ User *&UserChainTail);
/// Looks for a constant offset from the given GEP index without extracting
/// it. It returns the numeric value of the extracted constant offset (0 if
/// failed). The meaning of the arguments are the same as Extract.
- static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
- const DominatorTree *DT);
+ static int64_t Find(Value *Idx, GetElementPtrInst *GEP);
private:
- ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
- : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
- }
+ ConstantOffsetExtractor(Instruction *InsertionPt)
+ : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()) {}
/// Searches the expression that computes V for a non-zero constant C s.t.
/// V can be reassociated into the form V' + C. If the searching is
@@ -337,7 +335,6 @@ class ConstantOffsetExtractor {
Instruction *IP;
const DataLayout &DL;
- const DominatorTree *DT;
};
/// A pass that tries to split every GEP in the function into a variadic
@@ -521,13 +518,9 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
// Do not trace into "or" unless it is equivalent to "add".
- // This is the case if the "or"'s disjoint flag is set, or (because we
- // currently don't infer the disjoint flags) if its left and right operands
- // have nothing in common.
+ // This is the case if the or's disjoint flag is set.
if (BO->getOpcode() == Instruction::Or &&
- !(cast<PossiblyDisjointInst>(BO)->isDisjoint() ||
- haveNoCommonBitsSet(LHS, RHS,
- SimplifyQuery(DL, DT, /*AC*/ nullptr, BO))))
+ !cast<PossiblyDisjointInst>(BO)->isDisjoint())
return false;
// FIXME: We don't currently support constants from the RHS of subs,
@@ -781,9 +774,8 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
}
Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
- User *&UserChainTail,
- const DominatorTree *DT) {
- ConstantOffsetExtractor Extractor(GEP, DT);
+ User *&UserChainTail) {
+ ConstantOffsetExtractor Extractor(GEP);
// Find a non-zero constant offset first.
APInt ConstantOffset =
Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
@@ -798,10 +790,9 @@ Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
return IdxWithoutConstOffset;
}
-int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
- const DominatorTree *DT) {
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) {
// If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
- return ConstantOffsetExtractor(GEP, DT)
+ return ConstantOffsetExtractor(GEP)
.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
GEP->isInBounds())
.getSExtValue();
@@ -839,7 +830,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
// Tries to extract a constant offset from this GEP index.
int64_t ConstantOffset =
- ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
+ ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP);
if (ConstantOffset != 0) {
NeedsExtraction = true;
// A GEP may have multiple indices. We accumulate the extracted
@@ -1029,7 +1020,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
Value *OldIdx = GEP->getOperand(I);
User *UserChainTail;
Value *NewIdx =
- ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
+ ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail);
if (NewIdx != nullptr) {
// Switches to the index with the constant offset removed.
GEP->setOperand(I, NewIdx);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
index 0a51de3cdf20b4..9e58b716adb1ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
@@ -19,17 +19,17 @@ define amdgpu_cs void @test1(i32 %arg1, <4 x i32> inreg %arg2, i32, ptr addrspac
%ad1 = ptrtoint ptr addrspace(6) %ep1 to i32
call void @llvm.amdgcn.raw.buffer.store.i32(i32 11, <4 x i32> %arg2, i32 %ad1, i32 0, i32 0)
- %bs2 = or i32 %bs1, 1
+ %bs2 = or disjoint i32 %bs1, 1
%ep2 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs2
%ad2 = ptrtoint ptr addrspace(6) %ep2 to i32
call void @llvm.amdgcn.raw.buffer.store.i32(i32 22, <4 x i32> %arg2, i32 %ad2, i32 0, i32 0)
- %bs3 = or i32 %bs1, 2
+ %bs3 = or disjoint i32 %bs1, 2
%ep3 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs3
%ad3 = ptrtoint ptr addrspace(6) %ep3 to i32
call void @llvm.amdgcn.raw.buffer.store.i32(i32 33, <4 x i32> %arg2, i32 %ad3, i32 0, i32 0)
- %bs4 = or i32 %bs1, 3
+ %bs4 = or disjoint i32 %bs1, 3
%ep4 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs4
%ad4 = ptrtoint ptr addrspace(6) %ep4 to i32
call void @llvm.amdgcn.raw.buffer.store.i32(i32 44, <4 x i32> %arg2, i32 %ad4, i32 0, i32 0)
@@ -55,17 +55,17 @@ define amdgpu_cs void @test1_ptr(i32 %arg1, ptr addrspace(8) inreg %arg2, i32, p
%ad1 = ptrtoint ptr addrspace(6) %ep1 to i32
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 11, ptr addrspace(8) %arg2, i32 %ad1, i32 0, i32 0)
- %bs2 = or i32 %bs1, 1
+ %bs2 = or disjoint i32 %bs1, 1
%ep2 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs2
%ad2 = ptrtoint ptr addrspace(6) %ep2 to i32
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 22, ptr addrspace(8) %arg2, i32 %ad2, i32 0, i32 0)
- %bs3 = or i32 %bs1, 2
+ %bs3 = or disjoint i32 %bs1, 2
%ep3 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs3
%ad3 = ptrtoint ptr addrspace(6) %ep3 to i32
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 33, ptr addrspace(8) %arg2, i32 %ad3, i32 0, i32 0)
- %bs4 = or i32 %bs1, 3
+ %bs4 = or disjoint i32 %bs1, 3
%ep4 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs4
%ad4 = ptrtoint ptr addrspace(6) %ep4 to i32
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 44, ptr addrspace(8) %arg2, i32 %ad4, i32 0, i32 0)
@@ -90,17 +90,17 @@ define amdgpu_cs void @test2(i32 %arg1, <4 x i32> inreg %arg2) {
%ad1 = ptrtoint ptr addrspace(6) %ep1 to i32
call void @llvm.amdgcn.raw.buffer.store.i32(i32 11, <4 x i32> %arg2, i32 %ad1, i32 0, i32 0)
- %bs2 = or i32 %bs1, 1
+ %bs2 = or disjoint i32 %bs1, 1
%ep2 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs2
%ad2 = ptrtoint ptr addrspace(6) %ep2 to i32
call void @llvm.amdgcn.raw.buffer.store.i32(i32 22, <4 x i32> %arg2, i32 %ad2, i32 0, i32 0)
- %bs3 = or i32 %bs1, 2
+ %bs3 = or disjoint i32 %bs1, 2
%ep3 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs3
%ad3 = ptrtoint ptr addrspace(6) %ep3 to i32
call void @llvm.amdgcn.raw.buffer.store.i32(i32 33, <4 x i32> %arg2, i32 %ad3, i32 0, i32 0)
- %bs4 = or i32 %bs1, 3
+ %bs4 = or disjoint i32 %bs1, 3
%ep4 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs4
%ad4 = ptrtoint ptr addrspace(6) %ep4 to i32
call void @llvm.amdgcn.raw.buffer.store.i32(i32 44, <4 x i32> %arg2, i32 %ad4, i32 0, i32 0)
@@ -125,17 +125,17 @@ define amdgpu_cs void @test2_ptr(i32 %arg1, ptr addrspace(8) inreg %arg2) {
%ad1 = ptrtoint ptr addrspace(6) %ep1 to i32
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 11, ptr addrspace(8) %arg2, i32 %ad1, i32 0, i32 0)
- %bs2 = or i32 %bs1, 1
+ %bs2 = or disjoint i32 %bs1, 1
%ep2 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs2
%ad2 = ptrtoint ptr addrspace(6) %ep2 to i32
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 22, ptr addrspace(8) %arg2, i32 %ad2, i32 0, i32 0)
- %bs3 = or i32 %bs1, 2
+ %bs3 = or disjoint i32 %bs1, 2
%ep3 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs3
%ad3 = ptrtoint ptr addrspace(6) %ep3 to i32
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 33, ptr addrspace(8) %arg2, i32 %ad3, i32 0, i32 0)
- %bs4 = or i32 %bs1, 3
+ %bs4 = or disjoint i32 %bs1, 3
%ep4 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs4
%ad4 = ptrtoint ptr addrspace(6) %ep4 to i32
call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 44, ptr addrspace(8) %arg2, i32 %ad4, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
index a89a2bb28b87bb..8cb7d6651a08c2 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -238,7 +238,7 @@ main_body:
%25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24, !amdgpu.uniform !0
%26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
%27 = shl i32 %23, 2
- %28 = or i32 %27, 3
+ %28 = or disjoint i32 %27, 3
%29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28, !amdgpu.uniform !0
%30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
%31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
@@ -270,7 +270,7 @@ main_body:
%25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24
%26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
%27 = shl i32 %23, 2
- %28 = or i32 %27, 3
+ %28 = or disjoint i32 %27, 3
%29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28
%30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
%31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 4202edfbd0eb45..069c57e2ae63e2 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -1157,38 +1157,38 @@ define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
; GFX11-NEXT: s_setpc_b64 s[30:31]
%idx = shl i32 %idxp, 4
- %i.0 = or i32 %idx, 0
+ %i.0 = or disjoint i32 %idx, 0
%p.0 = getelementptr half, ptr addrspace(3) %p, i32 %i.0
%x.0 = load i16, ptr addrspace(3) %p.0, align 4
%v0p = insertelement <8 x i16> poison, i16 %x.0, i32 0
- %i.1 = or i32 %idx, 1
+ %i.1 = or disjoint i32 %idx, 1
%p.1 = getelementptr half, ptr addrspace(3) %p, i32 %i.1
%x.1 = load i16, ptr addrspace(3) %p.1, align 2
%v0 = insertelement <8 x i16> %v0p, i16 %x.1, i32 1
- %i.2 = or i32 %idx, 2
+ %i.2 = or disjoint i32 %idx, 2
%p.2 = getelementptr half, ptr addrspace(3) %p, i32 %i.2
%x.2 = load i16, ptr addrspace(3) %p.2, align 4
%v1p = insertelement <8 x i16> poison, i16 %x.2, i32 0
- %i.3 = or i32 %idx, 3
+ %i.3 = or disjoint i32 %idx, 3
%p.3 = getelementptr half, ptr addrspace(3) %p, i32 %i.3
%x.3 = load i16, ptr addrspace(3) %p.3, align 2
%v1 = insertelement <8 x i16> %v1p, i16 %x.3, i32 1
- %i.4 = or i32 %idx, 4
+ %i.4 = or disjoint i32 %idx, 4
%p.4 = getelementptr half, ptr addrspace(3) %p, i32 %i.4
%x.4 = load i16, ptr addrspace(3) %p.4, align 4
%v2p = insertelement <8 x i16> poison, i16 %x.4, i32 0
- %i.5 = or i32 %idx, 5
+ %i.5 = or disjoint i32 %idx, 5
%p.5 = getelementptr half, ptr addrspace(3) %p, i32 %i.5
%x.5 = load i16, ptr addrspace(3) %p.5, align 2
%v2 = insertelement <8 x i16> %v2p, i16 %x.5, i32 1
- %i.6 = or i32 %idx, 6
+ %i.6 = or disjoint i32 %idx, 6
%p.6 = getelementptr half, ptr addrspace(3) %p, i32 %i.6
%x.6 = load i16, ptr addrspace(3) %p.6, align 4
%v3p = insertelement <8 x i16> poison, i16 %x.6, i32 0
- %i.7 = or i32 %idx, 7
+ %i.7 = or disjoint i32 %idx, 7
%p.7 = getelementptr half, ptr addrspace(3) %p, i32 %i.7
%x.7 = load i16, ptr addrspace(3) %p.7, align 2
%v3 = insertelement <8 x i16> %v3p, i16 %x.7, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 0f2b2aa4d3562b..d9f6ce0b4c8513 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -732,7 +732,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
%192 = and i64 %191, 4294967168
%193 = getelementptr inbounds i8, ptr addrspace(1) %1, i64 %192
%194 = shl nuw nsw i32 %178, 5
- %195 = or i32 %194, 8
+ %195 = or disjoint i32 %194, 8
%196 = zext i32 %195 to i64
%197 = getelementptr inbounds i8, ptr addrspace(1) %193, i64 %196
%198 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 -4
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index 672c313cf5d194..9322b9e0fe6c82 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -78,11 +78,11 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177
%t3 = shl nuw nsw i32 %t1, 9
%ttile_origin.2 = and i32 %t3, 130560
%tstart_offset_x_mul = shl nuw nsw i32 %t0, 1
- %t4 = or i32 %ttile_origin.2, %tstart_offset_x_mul
- %t6 = or i32 %t4, 1
- %t8 = or i32 %t4, 128
+ %t4 = or disjoint i32 %ttile_origin.2, %tstart_offset_x_mul
+ %t6 = or disjoint i32 %t4, 1
+ %t8 = or disjoint i32 %t4, 128
%t9 = zext i32 %t8 to i64
- %t10 = or i32 %t4, 129
+ %t10 = or disjoint i32 %t4, 129
%t11 = zext i32 %t10 to i64
%t20 = zext i32 %t2 to i64
%t27 = getelementptr inbounds [1024 x [131072 x i8]], ptr %alloc0, i64 0, i64 %t20, i64 %t9
diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
index 92d67bafd51832..53b0a2737122e1 100644
--- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -496,7 +496,7 @@ for.body: ; preds = %for.body, %for.body
%idxprom = zext i32 %mul to i64
%arrayidx = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom
%4 = load <16 x i8>, ptr %arrayidx, align 16
- %add2 = or i32 %mul, 1
+ %add2 = or disjoint i32 %mul, 1
%idxprom3 = zext i32 %add2 to i64
%arrayidx4 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom3
%5 = load <16 x i8>, ptr %arrayidx4, align 16
diff --git a/llvm/test/CodeGen/PowerPC/sched-addi.ll b/llvm/test/CodeGen/PowerPC/sched-addi.ll
index ce6679ab7bb3d5..65cc47be28ee14 100644
--- a/llvm/test/CodeGen/PowerPC/sched-addi.ll
+++ b/llvm/test/CodeGen/PowerPC/sched-addi.ll
@@ -99,7 +99,7 @@ entry:
vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
- %offset.idx = or i64 %index, 1
+ %offset.idx = or disjoint i64 %index, 1
%0 = getelementptr %_elem_type_of_x, ptr %x_rvo_based_addr_3, i64 %offset.idx, i32 0
%1 = getelementptr %_elem_type_of_a, ptr %a_rvo_based_addr_5, i64 %offset.idx, i32 0
%wide.load = load <4 x double>, ptr %1, align 8
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
index 427681ac724ee5..83e36eba36c842 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
@@ -157,7 +157,7 @@ main_body:
%25 = getelementptr [0 x <8 x i32>], ptr addrspace(4) %1, i32 0, i32 %24, !amdgpu.uniform !0
%26 = load <8 x i32>, ptr addrspace(4) %25, align 32, !invariant.load !0
%27 = shl i32 %23, 2
- %28 = or i32 %27, 3
+ %28 = or disjoint i32 %27, 3
%29 = getelementptr [0 x <4 x i32>], ptr addrspace(4) %1, i32 0, i32 %28, !amdgpu.uniform !0
%30 = load <4 x i32>, ptr addrspace(4) %29, align 16, !invariant.load !0
%31 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %30, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
index 49c6a46b136d52..c915b9a5e59ac5 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
@@ -142,7 +142,7 @@ define ptr @sext_or(i64 %a, i32 %b) {
;
entry:
%b1 = shl i32 %b, 2
- %b2 = or i32 %b1, 1 ; (b << 2) and 1 have no common bits
+ %b2 = or disjoint i32 %b1, 1 ; (b << 2) and 1 have no common bits
%b3 = or i32 %b1, 4 ; (b << 2) and 4 may have common bits
%b2.ext = zext i32 %b2 to i64
%b3.ext = sext i32 %b3 to i64
@@ -335,7 +335,7 @@ define ptr @shl_add_or(i64 %a, ptr %ptr) {
entry:
%shl = shl i64 %a, 2
%add = add i64 %shl, 12
- %or = or i64 %add, 1
+ %or = or disjoint i64 %add, 1
; ((a << 2) + 12) and 1 have no common bits. Therefore,
; SeparateConstOffsetFromGEP is able to extract the 12.
; TODO(jingyue): We could reassociate the expression to combine 12 and 1.
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
index 5041fed12f5963..e405bbd5347ee0 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
@@ -22,16 +22,17 @@ define void @testOrDoesntSplit(ptr %p) {
ret void
}
-define void @testNoBitsInCommonOrSplits(ptr %p) {
-; CHECK-LABEL: define void @testNoBitsInCommonOrSplits(
+; COM: Checking for `or disjoint` replaced the old haveNoCommonBitsSet()
+; COM: check; ensure that failing to annotate an `or` with `disjoint` makes
+; COM: the optimization fail.
+define void @testNoBitsInCommonOrDoesntSplit(ptr %p) {
+; CHECK-LABEL: define void @testNoBitsInCommonOrDoesntSplit(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[VAR:%.*]] = tail call i64 @foo()
; CHECK-NEXT: [[VAR_HIGH:%.*]] = and i64 [[VAR]], -16
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[VAR_HIGH]]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 10
-; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; CHECK-NEXT: store i8 0, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[OFF:%.*]] = or i64 [[VAR_HIGH]], 10
+; CHECK-NEXT: [[Q:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFF]]
+; CHECK-NEXT: store i8 0, ptr [[Q]], align 1
; CHECK-NEXT: ret void
;
%var = tail call i64 @foo()