[clang] [clang-tools-extra] [llvm] [SeperateConstOffsetFromGEP] Handle `or disjoint` flags (PR #76997)

Krzysztof Drewniak via cfe-commits cfe-commits at lists.llvm.org
Thu Jan 25 14:53:32 PST 2024


https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/76997

>From 5cc46862df42e7d01a2d45ccc18f221744af0b93 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Thu, 4 Jan 2024 20:20:54 +0000
Subject: [PATCH 1/2] [SeperateConstOffsetFromGEP] Handle `or disjoint` flags

This commit extends separate-const-offset-from-gep to look at the
newly-added `disjoint` flag on `or` instructions so as to preserve
additional opportunities for optimization.

As with other `or disjoint`-handling commits, this does not remove the
existing check for the or's operands having no bits in common because
`disjoint` is currently not inferred.

The tests were pre-committed in #76972.
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp           | 13 ++++++++-----
 .../split-gep-or-as-add.ll                          |  8 +++++---
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 225dd454068c84..9bb42f7be8d70a 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -174,6 +174,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -519,12 +520,14 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
   }
 
   Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
-  // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
-  // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
-  // FIXME: this does not appear to be covered by any tests
-  //        (with x86/aarch64 backends at least)
+  // Do not trace into "or" unless it is equivalent to "add".
+  // This is the case if the "or"'s disjoint flag is set, or (because we
+  // currently don't infer the disjoint flags) if its left and right operands
+  // have nothing in commen.
   if (BO->getOpcode() == Instruction::Or &&
-      !haveNoCommonBitsSet(LHS, RHS, SimplifyQuery(DL, DT, /*AC*/ nullptr, BO)))
+      !(cast<PossiblyDisjointInst>(BO)->isDisjoint() ||
+        haveNoCommonBitsSet(LHS, RHS,
+                            SimplifyQuery(DL, DT, /*AC*/ nullptr, BO))))
     return false;
 
   // FIXME: We don't currently support constants from the RHS of subs,
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
index 45154f5a68f92c..5041fed12f5963 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
@@ -46,9 +46,11 @@ define void @testDisjointOrSplits(ptr %p) {
 ; CHECK-LABEL: define void @testDisjointOrSplits(
 ; CHECK-SAME: ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[VAR:%.*]] = tail call i64 @foo()
-; CHECK-NEXT:    [[OFF:%.*]] = or disjoint i64 [[VAR]], 10
-; CHECK-NEXT:    [[Q:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFF]]
-; CHECK-NEXT:    store i8 0, ptr [[Q]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], [[VAR]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], 10
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT:    store i8 0, ptr [[TMP4]], align 1
 ; CHECK-NEXT:    ret void
 ;
   %var = tail call i64 @foo()

>From af2a388a837adaf8829a71ec863716051afb8324 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Thu, 25 Jan 2024 22:52:24 +0000
Subject: [PATCH 2/2] Remove old check, update tests.

All tests that were relying on the haveNoCommonBitsSet() check in
SeparateConstOffsetFromGEP have been updated to have the relevant
`or`s annotated with `disjoint`.
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp     | 33 +++++++------------
 .../AMDGPU/GlobalISel/merge-buffer-stores.ll  | 24 +++++++-------
 .../AMDGPU/constant-address-space-32bit.ll    |  4 +--
 .../CodeGen/AMDGPU/extract-subvector-16bit.ll | 16 ++++-----
 ...ne-sink-temporal-divergence-swdev407790.ll |  2 +-
 llvm/test/CodeGen/NVPTX/vector-loads.ll       |  8 ++---
 llvm/test/CodeGen/PowerPC/mma-intrinsics.ll   |  2 +-
 llvm/test/CodeGen/PowerPC/sched-addi.ll       |  2 +-
 ...-gep-and-gvn-addrspace-addressing-modes.ll |  2 +-
 .../NVPTX/split-gep.ll                        |  4 +--
 .../split-gep-or-as-add.ll                    | 15 +++++----
 11 files changed, 52 insertions(+), 60 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 65a96727c3b541..4481375054ecf1 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -236,18 +236,16 @@ class ConstantOffsetExtractor {
   /// \p UserChainTail Outputs the tail of UserChain so that we can
   ///                  garbage-collect unused instructions in UserChain.
   static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
-                        User *&UserChainTail, const DominatorTree *DT);
+                        User *&UserChainTail);
 
   /// Looks for a constant offset from the given GEP index without extracting
   /// it. It returns the numeric value of the extracted constant offset (0 if
   /// failed). The meaning of the arguments are the same as Extract.
-  static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
-                      const DominatorTree *DT);
+  static int64_t Find(Value *Idx, GetElementPtrInst *GEP);
 
 private:
-  ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
-      : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
-  }
+  ConstantOffsetExtractor(Instruction *InsertionPt)
+      : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()) {}
 
   /// Searches the expression that computes V for a non-zero constant C s.t.
   /// V can be reassociated into the form V' + C. If the searching is
@@ -337,7 +335,6 @@ class ConstantOffsetExtractor {
   Instruction *IP;
 
   const DataLayout &DL;
-  const DominatorTree *DT;
 };
 
 /// A pass that tries to split every GEP in the function into a variadic
@@ -521,13 +518,9 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
 
   Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
   // Do not trace into "or" unless it is equivalent to "add".
-  // This is the case if the "or"'s disjoint flag is set, or (because we
-  // currently don't infer the disjoint flags) if its left and right operands
-  // have nothing in commen.
+  // This is the case if the or's disjoint flag is set.
   if (BO->getOpcode() == Instruction::Or &&
-      !(cast<PossiblyDisjointInst>(BO)->isDisjoint() ||
-        haveNoCommonBitsSet(LHS, RHS,
-                            SimplifyQuery(DL, DT, /*AC*/ nullptr, BO))))
+      !cast<PossiblyDisjointInst>(BO)->isDisjoint())
     return false;
 
   // FIXME: We don't currently support constants from the RHS of subs,
@@ -781,9 +774,8 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
 }
 
 Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
-                                        User *&UserChainTail,
-                                        const DominatorTree *DT) {
-  ConstantOffsetExtractor Extractor(GEP, DT);
+                                        User *&UserChainTail) {
+  ConstantOffsetExtractor Extractor(GEP);
   // Find a non-zero constant offset first.
   APInt ConstantOffset =
       Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
@@ -798,10 +790,9 @@ Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
   return IdxWithoutConstOffset;
 }
 
-int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
-                                      const DominatorTree *DT) {
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) {
   // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
-  return ConstantOffsetExtractor(GEP, DT)
+  return ConstantOffsetExtractor(GEP)
       .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
             GEP->isInBounds())
       .getSExtValue();
@@ -839,7 +830,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
 
       // Tries to extract a constant offset from this GEP index.
       int64_t ConstantOffset =
-          ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
+          ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP);
       if (ConstantOffset != 0) {
         NeedsExtraction = true;
         // A GEP may have multiple indices.  We accumulate the extracted
@@ -1029,7 +1020,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
       Value *OldIdx = GEP->getOperand(I);
       User *UserChainTail;
       Value *NewIdx =
-          ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
+          ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail);
       if (NewIdx != nullptr) {
         // Switches to the index with the constant offset removed.
         GEP->setOperand(I, NewIdx);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
index 0a51de3cdf20b4..9e58b716adb1ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
@@ -19,17 +19,17 @@ define amdgpu_cs void @test1(i32 %arg1, <4 x i32> inreg %arg2, i32, ptr addrspac
   %ad1 = ptrtoint ptr addrspace(6) %ep1 to i32
   call void @llvm.amdgcn.raw.buffer.store.i32(i32 11, <4 x i32> %arg2, i32 %ad1, i32 0, i32 0)
 
-  %bs2 = or i32 %bs1, 1
+  %bs2 = or disjoint i32 %bs1, 1
   %ep2 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs2
   %ad2 = ptrtoint ptr addrspace(6) %ep2 to i32
   call void @llvm.amdgcn.raw.buffer.store.i32(i32 22, <4 x i32> %arg2, i32 %ad2, i32 0, i32 0)
 
-  %bs3 = or i32 %bs1, 2
+  %bs3 = or disjoint i32 %bs1, 2
   %ep3 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs3
   %ad3 = ptrtoint ptr addrspace(6) %ep3 to i32
   call void @llvm.amdgcn.raw.buffer.store.i32(i32 33, <4 x i32> %arg2, i32 %ad3, i32 0, i32 0)
 
-  %bs4 = or i32 %bs1, 3
+  %bs4 = or disjoint i32 %bs1, 3
   %ep4 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs4
   %ad4 = ptrtoint ptr addrspace(6) %ep4 to i32
   call void @llvm.amdgcn.raw.buffer.store.i32(i32 44, <4 x i32> %arg2, i32 %ad4, i32 0, i32 0)
@@ -55,17 +55,17 @@ define amdgpu_cs void @test1_ptr(i32 %arg1, ptr addrspace(8) inreg %arg2, i32, p
   %ad1 = ptrtoint ptr addrspace(6) %ep1 to i32
   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 11, ptr addrspace(8) %arg2, i32 %ad1, i32 0, i32 0)
 
-  %bs2 = or i32 %bs1, 1
+  %bs2 = or disjoint i32 %bs1, 1
   %ep2 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs2
   %ad2 = ptrtoint ptr addrspace(6) %ep2 to i32
   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 22, ptr addrspace(8) %arg2, i32 %ad2, i32 0, i32 0)
 
-  %bs3 = or i32 %bs1, 2
+  %bs3 = or disjoint i32 %bs1, 2
   %ep3 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs3
   %ad3 = ptrtoint ptr addrspace(6) %ep3 to i32
   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 33, ptr addrspace(8) %arg2, i32 %ad3, i32 0, i32 0)
 
-  %bs4 = or i32 %bs1, 3
+  %bs4 = or disjoint i32 %bs1, 3
   %ep4 = getelementptr i32, ptr addrspace(6) %arg3, i32 %bs4
   %ad4 = ptrtoint ptr addrspace(6) %ep4 to i32
   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 44, ptr addrspace(8) %arg2, i32 %ad4, i32 0, i32 0)
@@ -90,17 +90,17 @@ define amdgpu_cs void @test2(i32 %arg1, <4 x i32> inreg %arg2) {
   %ad1 = ptrtoint ptr addrspace(6) %ep1 to i32
   call void @llvm.amdgcn.raw.buffer.store.i32(i32 11, <4 x i32> %arg2, i32 %ad1, i32 0, i32 0)
 
-  %bs2 = or i32 %bs1, 1
+  %bs2 = or disjoint i32 %bs1, 1
   %ep2 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs2
   %ad2 = ptrtoint ptr addrspace(6) %ep2 to i32
   call void @llvm.amdgcn.raw.buffer.store.i32(i32 22, <4 x i32> %arg2, i32 %ad2, i32 0, i32 0)
 
-  %bs3 = or i32 %bs1, 2
+  %bs3 = or disjoint i32 %bs1, 2
   %ep3 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs3
   %ad3 = ptrtoint ptr addrspace(6) %ep3 to i32
   call void @llvm.amdgcn.raw.buffer.store.i32(i32 33, <4 x i32> %arg2, i32 %ad3, i32 0, i32 0)
 
-  %bs4 = or i32 %bs1, 3
+  %bs4 = or disjoint i32 %bs1, 3
   %ep4 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs4
   %ad4 = ptrtoint ptr addrspace(6) %ep4 to i32
   call void @llvm.amdgcn.raw.buffer.store.i32(i32 44, <4 x i32> %arg2, i32 %ad4, i32 0, i32 0)
@@ -125,17 +125,17 @@ define amdgpu_cs void @test2_ptr(i32 %arg1, ptr addrspace(8) inreg %arg2) {
   %ad1 = ptrtoint ptr addrspace(6) %ep1 to i32
   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 11, ptr addrspace(8) %arg2, i32 %ad1, i32 0, i32 0)
 
-  %bs2 = or i32 %bs1, 1
+  %bs2 = or disjoint i32 %bs1, 1
   %ep2 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs2
   %ad2 = ptrtoint ptr addrspace(6) %ep2 to i32
   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 22, ptr addrspace(8) %arg2, i32 %ad2, i32 0, i32 0)
 
-  %bs3 = or i32 %bs1, 2
+  %bs3 = or disjoint i32 %bs1, 2
   %ep3 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs3
   %ad3 = ptrtoint ptr addrspace(6) %ep3 to i32
   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 33, ptr addrspace(8) %arg2, i32 %ad3, i32 0, i32 0)
 
-  %bs4 = or i32 %bs1, 3
+  %bs4 = or disjoint i32 %bs1, 3
   %ep4 = getelementptr <{ [64 x i32] }>, ptr addrspace(6) null, i32 0, i32 0, i32 %bs4
   %ad4 = ptrtoint ptr addrspace(6) %ep4 to i32
   call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 44, ptr addrspace(8) %arg2, i32 %ad4, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
index a89a2bb28b87bb..8cb7d6651a08c2 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -238,7 +238,7 @@ main_body:
   %25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24, !amdgpu.uniform !0
   %26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
   %27 = shl i32 %23, 2
-  %28 = or i32 %27, 3
+  %28 = or disjoint i32 %27, 3
   %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28, !amdgpu.uniform !0
   %30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
   %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
@@ -270,7 +270,7 @@ main_body:
   %25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24
   %26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
   %27 = shl i32 %23, 2
-  %28 = or i32 %27, 3
+  %28 = or disjoint i32 %27, 3
   %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28
   %30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
   %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 4202edfbd0eb45..069c57e2ae63e2 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -1157,38 +1157,38 @@ define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %idx = shl i32 %idxp, 4
 
-  %i.0 = or i32 %idx, 0
+  %i.0 = or disjoint i32 %idx, 0
   %p.0 = getelementptr half, ptr addrspace(3) %p, i32 %i.0
   %x.0 = load i16, ptr addrspace(3) %p.0, align 4
   %v0p = insertelement <8 x i16> poison, i16 %x.0, i32 0
-  %i.1 = or i32 %idx, 1
+  %i.1 = or disjoint i32 %idx, 1
   %p.1 = getelementptr half, ptr addrspace(3) %p, i32 %i.1
   %x.1 = load i16, ptr addrspace(3) %p.1, align 2
   %v0 = insertelement <8 x i16> %v0p, i16 %x.1, i32 1
 
-  %i.2 = or i32 %idx, 2
+  %i.2 = or disjoint i32 %idx, 2
   %p.2 = getelementptr half, ptr addrspace(3) %p, i32 %i.2
   %x.2 = load i16, ptr addrspace(3) %p.2, align 4
   %v1p = insertelement <8 x i16> poison, i16 %x.2, i32 0
-  %i.3 = or i32 %idx, 3
+  %i.3 = or disjoint i32 %idx, 3
   %p.3 = getelementptr half, ptr addrspace(3) %p, i32 %i.3
   %x.3 = load i16, ptr addrspace(3) %p.3, align 2
   %v1 = insertelement <8 x i16> %v1p, i16 %x.3, i32 1
 
-  %i.4 = or i32 %idx, 4
+  %i.4 = or disjoint i32 %idx, 4
   %p.4 = getelementptr half, ptr addrspace(3) %p, i32 %i.4
   %x.4 = load i16, ptr addrspace(3) %p.4, align 4
   %v2p = insertelement <8 x i16> poison, i16 %x.4, i32 0
-  %i.5 = or i32 %idx, 5
+  %i.5 = or disjoint i32 %idx, 5
   %p.5 = getelementptr half, ptr addrspace(3) %p, i32 %i.5
   %x.5 = load i16, ptr addrspace(3) %p.5, align 2
   %v2 = insertelement <8 x i16> %v2p, i16 %x.5, i32 1
 
-  %i.6 = or i32 %idx, 6
+  %i.6 = or disjoint i32 %idx, 6
   %p.6 = getelementptr half, ptr addrspace(3) %p, i32 %i.6
   %x.6 = load i16, ptr addrspace(3) %p.6, align 4
   %v3p = insertelement <8 x i16> poison, i16 %x.6, i32 0
-  %i.7 = or i32 %idx, 7
+  %i.7 = or disjoint i32 %idx, 7
   %p.7 = getelementptr half, ptr addrspace(3) %p, i32 %i.7
   %x.7 = load i16, ptr addrspace(3) %p.7, align 2
   %v3 = insertelement <8 x i16> %v3p, i16 %x.7, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 0f2b2aa4d3562b..d9f6ce0b4c8513 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -732,7 +732,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
   %192 = and i64 %191, 4294967168
   %193 = getelementptr inbounds i8, ptr addrspace(1) %1, i64 %192
   %194 = shl nuw nsw i32 %178, 5
-  %195 = or i32 %194, 8
+  %195 = or disjoint i32 %194, 8
   %196 = zext i32 %195 to i64
   %197 = getelementptr inbounds i8, ptr addrspace(1) %193, i64 %196
   %198 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 -4
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index 672c313cf5d194..9322b9e0fe6c82 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -78,11 +78,11 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177
   %t3 = shl nuw nsw i32 %t1, 9
   %ttile_origin.2 = and i32 %t3, 130560
   %tstart_offset_x_mul = shl nuw nsw i32 %t0, 1
-  %t4 = or i32 %ttile_origin.2, %tstart_offset_x_mul
-  %t6 = or i32 %t4, 1
-  %t8 = or i32 %t4, 128
+  %t4 = or disjoint i32 %ttile_origin.2, %tstart_offset_x_mul
+  %t6 = or disjoint i32 %t4, 1
+  %t8 = or disjoint i32 %t4, 128
   %t9 = zext i32 %t8 to i64
-  %t10 = or i32 %t4, 129
+  %t10 = or disjoint i32 %t4, 129
   %t11 = zext i32 %t10 to i64
   %t20 = zext i32 %t2 to i64
   %t27 = getelementptr inbounds [1024 x [131072 x i8]], ptr %alloc0, i64 0, i64 %t20, i64 %t9
diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
index 92d67bafd51832..53b0a2737122e1 100644
--- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -496,7 +496,7 @@ for.body:                                         ; preds = %for.body, %for.body
   %idxprom = zext i32 %mul to i64
   %arrayidx = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom
   %4 = load <16 x i8>, ptr %arrayidx, align 16
-  %add2 = or i32 %mul, 1
+  %add2 = or disjoint i32 %mul, 1
   %idxprom3 = zext i32 %add2 to i64
   %arrayidx4 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom3
   %5 = load <16 x i8>, ptr %arrayidx4, align 16
diff --git a/llvm/test/CodeGen/PowerPC/sched-addi.ll b/llvm/test/CodeGen/PowerPC/sched-addi.ll
index ce6679ab7bb3d5..65cc47be28ee14 100644
--- a/llvm/test/CodeGen/PowerPC/sched-addi.ll
+++ b/llvm/test/CodeGen/PowerPC/sched-addi.ll
@@ -99,7 +99,7 @@ entry:
 
 vector.body:
   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
-   %offset.idx = or i64 %index, 1
+   %offset.idx = or disjoint i64 %index, 1
   %0 = getelementptr %_elem_type_of_x, ptr %x_rvo_based_addr_3, i64 %offset.idx, i32 0
   %1 = getelementptr %_elem_type_of_a, ptr %a_rvo_based_addr_5, i64 %offset.idx, i32 0
   %wide.load = load <4 x double>, ptr %1, align 8
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
index 427681ac724ee5..83e36eba36c842 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
@@ -157,7 +157,7 @@ main_body:
   %25 = getelementptr [0 x <8 x i32>], ptr addrspace(4) %1, i32 0, i32 %24, !amdgpu.uniform !0
   %26 = load <8 x i32>, ptr addrspace(4) %25, align 32, !invariant.load !0
   %27 = shl i32 %23, 2
-  %28 = or i32 %27, 3
+  %28 = or disjoint i32 %27, 3
   %29 = getelementptr [0 x <4 x i32>], ptr addrspace(4) %1, i32 0, i32 %28, !amdgpu.uniform !0
   %30 = load <4 x i32>, ptr addrspace(4) %29, align 16, !invariant.load !0
   %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %30, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
index 49c6a46b136d52..c915b9a5e59ac5 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
@@ -142,7 +142,7 @@ define ptr @sext_or(i64 %a, i32 %b) {
 ;
 entry:
   %b1 = shl i32 %b, 2
-  %b2 = or i32 %b1, 1 ; (b << 2) and 1 have no common bits
+  %b2 = or disjoint i32 %b1, 1 ; (b << 2) and 1 have no common bits
   %b3 = or i32 %b1, 4 ; (b << 2) and 4 may have common bits
   %b2.ext = zext i32 %b2 to i64
   %b3.ext = sext i32 %b3 to i64
@@ -335,7 +335,7 @@ define ptr @shl_add_or(i64 %a, ptr %ptr) {
 entry:
   %shl = shl i64 %a, 2
   %add = add i64 %shl, 12
-  %or = or i64 %add, 1
+  %or = or disjoint i64 %add, 1
   ; ((a << 2) + 12) and 1 have no common bits. Therefore,
   ; SeparateConstOffsetFromGEP is able to extract the 12.
   ; TODO(jingyue): We could reassociate the expression to combine 12 and 1.
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
index 5041fed12f5963..e405bbd5347ee0 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/split-gep-or-as-add.ll
@@ -22,16 +22,17 @@ define void @testOrDoesntSplit(ptr %p) {
   ret void
 }
 
-define void @testNoBitsInCommonOrSplits(ptr %p) {
-; CHECK-LABEL: define void @testNoBitsInCommonOrSplits(
+; COM: The check for `or disjoint` removed the old hasNoBitsInCommon()
+; COM: check, ensure that failing to annotate an or with disjoint makes
+; COM: the optimization fail.
+define void @testNoBitsInCommonOrDoesntSplit(ptr %p) {
+; CHECK-LABEL: define void @testNoBitsInCommonOrDoesntSplit(
 ; CHECK-SAME: ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[VAR:%.*]] = tail call i64 @foo()
 ; CHECK-NEXT:    [[VAR_HIGH:%.*]] = and i64 [[VAR]], -16
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], [[VAR_HIGH]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], 10
-; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; CHECK-NEXT:    store i8 0, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[OFF:%.*]] = or i64 [[VAR_HIGH]], 10
+; CHECK-NEXT:    [[Q:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFF]]
+; CHECK-NEXT:    store i8 0, ptr [[Q]], align 1
 ; CHECK-NEXT:    ret void
 ;
   %var = tail call i64 @foo()



More information about the cfe-commits mailing list