[llvm] [InferAddressSpaces] Support address space inference from load values (PR #171019)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 7 00:15:38 PST 2025
https://github.com/actinks updated https://github.com/llvm/llvm-project/pull/171019
From f1ca224d2ca9f0cabfdbf636cee4ed5cda23b82a Mon Sep 17 00:00:00 2001
From: actink <actink at 163.com>
Date: Sun, 7 Dec 2025 11:25:08 +0800
Subject: [PATCH 1/2] precommit
---
.../InferAddressSpaces/NVPTX/load-ptr.ll | 240 ++++++++++++++++++
1 file changed, 240 insertions(+)
create mode 100644 llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll
diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll
new file mode 100644
index 0000000000000..fb3f55ab89497
--- /dev/null
+++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=infer-address-spaces %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define ptx_kernel void @globalmem_flat_ptr_with_global(ptr %a, ptr %b){
+; CHECK-LABEL: define ptx_kernel void @globalmem_flat_ptr_with_global(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[TMP0]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]]
+; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load ptr, ptr %a, align 8
+ %1 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+ %idxprom = zext nneg i32 %1 to i64
+ %arrayidx = getelementptr inbounds nuw i32, ptr %0, i64 %idxprom
+ %2 = load i32, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds nuw i32, ptr %b, i64 %idxprom
+ store i32 %2, ptr %arrayidx3, align 4
+ ret void
+}
+
+@shared_ptrs = internal unnamed_addr addrspace(3) global [32 x ptr] undef, align 8
+
+define ptx_kernel void @sharedmem_flat_ptr_with_global(ptr %a, ptr %b) {
+; CHECK-LABEL: define ptx_kernel void @sharedmem_flat_ptr_with_global(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr addrspace(3) @shared_ptrs, i64 [[IDXPROM]]
+; CHECK-NEXT: store ptr [[ARRAYIDX]], ptr addrspace(3) [[ARRAYIDX3]], align 8
+; CHECK-NEXT: tail call void @llvm.nvvm.bar.warp.sync(i32 -1)
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(3) [[ARRAYIDX3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP2]], i64 [[IDXPROM]]
+; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX9]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+ %idxprom = zext nneg i32 %0 to i64
+ %arrayidx = getelementptr inbounds nuw i32, ptr %a, i64 %idxprom
+ %arrayidx3 = getelementptr inbounds nuw ptr, ptr addrspacecast (ptr addrspace(3) @shared_ptrs to ptr), i64 %idxprom
+ store ptr %arrayidx, ptr %arrayidx3, align 8
+ tail call void @llvm.nvvm.bar.warp.sync(i32 -1)
+ %1 = load ptr, ptr %arrayidx3, align 8
+ %2 = load i32, ptr %1, align 4
+ %arrayidx9 = getelementptr inbounds nuw i32, ptr %b, i64 %idxprom
+ store i32 %2, ptr %arrayidx9, align 4
+ ret void
+}
+
+@a = dso_local addrspace(1) externally_initialized global ptr null, align 8
+@llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @a to ptr)], section "llvm.metadata"
+
+define dso_local ptx_kernel void @device_var_with_global(ptr %b) {
+; CHECK-LABEL: define dso_local ptx_kernel void @device_var_with_global(
+; CHECK-SAME: ptr [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(1) @a, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP0]], i64 [[IDXPROM]]
+; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load ptr, ptr addrspacecast (ptr addrspace(1) @a to ptr), align 8
+ %1 = load ptr, ptr %0, align 8
+ %2 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+ %idxprom = zext nneg i32 %2 to i64
+ %arrayidx = getelementptr inbounds nuw i32, ptr %1, i64 %idxprom
+ %3 = load i32, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds nuw i32, ptr %b, i64 %idxprom
+ store i32 %3, ptr %arrayidx3, align 4
+ ret void
+}
+
+
+define ptx_kernel void @globalmem_flat_ptr_with_global_clobber(ptr %a, ptr %b) {
+; CHECK-LABEL: define ptx_kernel void @globalmem_flat_ptr_with_global_clobber(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[TMP0]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP4]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]]
+; CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(1) [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 4
+; CHECK-NEXT: store i32 [[TMP5]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load ptr, ptr %a, align 8
+ %1 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+ %idxprom = zext nneg i32 %1 to i64
+ %arrayidx = getelementptr inbounds nuw ptr, ptr %b, i64 %idxprom
+ ; 1 = MemoryDef(liveOnEntry)
+ store ptr %0, ptr %arrayidx, align 8
+ ; MemoryUse(1)
+ %2 = load i32, ptr %0, align 4
+ %arrayidx4 = getelementptr inbounds nuw i8, ptr %0, i64 4
+ ; 2 = MemoryDef(1)
+ store i32 %2, ptr %arrayidx4, align 4
+ ret void
+}
+
+
+@s_int2 = internal addrspace(3) global [2 x i32] undef, align 4
+
+; Function Attrs: convergent mustprogress noinline norecurse nounwind
+define dso_local ptx_kernel void @phi_clobber_with_diff_as(ptr %a, ptr %b) {
+; CHECK-LABEL: define dso_local ptx_kernel void @phi_clobber_with_diff_as(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 0, ptr addrspace(3) @s_int2, align 4
+; CHECK-NEXT: store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @s_int2, i64 4), align 4
+; CHECK-NEXT: tail call void @llvm.nvvm.bar.warp.sync(i32 -1)
+; CHECK-NEXT: [[TMP2:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[TMP2]], 15
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP2]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[TMP0]], i64 [[IDXPROM]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[ENTRY_IF_END_CRIT_EDGE:.*]]
+; CHECK: [[ENTRY_IF_END_CRIT_EDGE]]:
+; CHECK-NEXT: [[DOTPRE:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX]], align 8
+; CHECK-NEXT: br label %[[IF_END:.*]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), ptr addrspace(1) [[ARRAYIDX]], align 8
+; CHECK-NEXT: br label %[[IF_END]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[DOTPRE]], %[[ENTRY_IF_END_CRIT_EDGE]] ], [ addrspacecast (ptr addrspace(3) @s_int2 to ptr), %[[IF_THEN]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]]
+; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 0, ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), align 4
+ store i32 0, ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), i64 4), align 4
+ tail call void @llvm.nvvm.bar.warp.sync(i32 -1)
+ %0 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+ %cmp = icmp samesign ugt i32 %0, 15
+ %idxprom = zext nneg i32 %0 to i64
+ %arrayidx = getelementptr inbounds nuw ptr, ptr %a, i64 %idxprom
+ br i1 %cmp, label %if.then, label %entry.if.end_crit_edge
+
+entry.if.end_crit_edge: ; preds = %entry
+ %.pre = load ptr, ptr %arrayidx, align 8
+ br label %if.end
+
+if.then: ; preds = %entry
+ store ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), ptr %arrayidx, align 8
+ br label %if.end
+
+if.end: ; preds = %entry.if.end_crit_edge, %if.then
+ %1 = phi ptr [ %.pre, %entry.if.end_crit_edge ], [ addrspacecast (ptr addrspace(3) @s_int2 to ptr), %if.then ]
+ %2 = load i32, ptr %1, align 4
+ %arrayidx7 = getelementptr inbounds nuw i32, ptr %b, i64 %idxprom
+ store i32 %2, ptr %arrayidx7, align 4
+ ret void
+}
+
+define ptx_kernel void @phi_clobber_with_same_as(ptr %a, ptr %b) {
+; CHECK-LABEL: define ptx_kernel void @phi_clobber_with_same_as(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1)
+; CHECK-NEXT: store i32 0, ptr addrspace(3) @s_int2, align 4
+; CHECK-NEXT: store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @s_int2, i64 4), align 4
+; CHECK-NEXT: tail call void @llvm.nvvm.bar.warp.sync(i32 -1)
+; CHECK-NEXT: [[TMP2:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[TMP2]], 15
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP2]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[TMP0]], i64 [[IDXPROM]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[ENTRY_IF_END_CRIT_EDGE:.*]]
+; CHECK: [[ENTRY_IF_END_CRIT_EDGE]]:
+; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), ptr addrspace(1) [[ARRAYIDX]], align 8
+; CHECK-NEXT: br label %[[IF_END:.*]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @s_int2, i64 4) to ptr), ptr addrspace(1) [[ARRAYIDX]], align 8
+; CHECK-NEXT: br label %[[IF_END]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]]
+; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 0, ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), align 4
+ store i32 0, ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), i64 4), align 4
+ tail call void @llvm.nvvm.bar.warp.sync(i32 -1)
+ %0 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+ %cmp = icmp samesign ugt i32 %0, 15
+ %idxprom = zext nneg i32 %0 to i64
+ %arrayidx = getelementptr inbounds nuw ptr, ptr %a, i64 %idxprom
+ br i1 %cmp, label %if.then, label %entry.if.end_crit_edge
+
+entry.if.end_crit_edge: ; preds = %entry
+ store ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), ptr %arrayidx, align 8
+ br label %if.end
+
+if.then: ; preds = %entry
+ store ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @s_int2 to ptr), i64 4), ptr %arrayidx, align 8
+ br label %if.end
+
+if.end: ; preds = %entry.if.end_crit_edge, %if.then
+ %1 = load ptr, ptr %arrayidx, align 8
+ %2 = load i32, ptr %1, align 4
+ %arrayidx7 = getelementptr inbounds nuw i32, ptr %b, i64 %idxprom
+ store i32 %2, ptr %arrayidx7, align 4
+ ret void
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare void @llvm.nvvm.bar.warp.sync(i32)
From d0c84d393b25f9da67a83cb717205228918db61a Mon Sep 17 00:00:00 2001
From: actink <actink at 163.com>
Date: Wed, 3 Dec 2025 11:21:43 +0800
Subject: [PATCH 2/2] [InferAddressSpaces] Support address space inference from
load values
---
.../llvm/Analysis/TargetTransformInfo.h | 6 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 6 +
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 6 +
llvm/include/llvm/Target/TargetMachine.h | 8 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 9 +
.../AMDGPU/AMDGPUPromoteKernelArguments.cpp | 37 +---
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 31 +++-
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 2 +
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 22 +++
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 2 +
.../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 48 ++++++
.../Target/NVPTX/NVPTXTargetTransformInfo.h | 5 +
.../Transforms/Scalar/InferAddressSpaces.cpp | 160 +++++++++++++++++-
.../AMDGPU/promote-kernel-arguments.ll | 2 +-
.../AMDGPU/infer-address-space.ll | 4 +-
.../InferAddressSpaces/NVPTX/load-ptr.ll | 37 ++--
16 files changed, 322 insertions(+), 63 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 99525607f744a..efb352018fbe4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -563,6 +563,8 @@ class TargetTransformInfo {
LLVM_ABI unsigned getAssumedAddrSpace(const Value *V) const;
+ LLVM_ABI unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const;
+
LLVM_ABI bool isSingleThreaded() const;
LLVM_ABI std::pair<const Value *, unsigned>
@@ -577,6 +579,10 @@ class TargetTransformInfo {
Value *OldV,
Value *NewV) const;
+ /// Return true if \p IID only acts as an artificial clobber used to enforce
+ /// ordering constraints.
+ LLVM_ABI bool isArtificialClobber(Intrinsic::ID IID) const;
+
/// Test whether calls to a function lower to actual program function
/// calls.
///
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 835eb7701ccfa..0130b5225ce3f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -158,6 +158,10 @@ class TargetTransformInfoImplBase {
virtual unsigned getAssumedAddrSpace(const Value *V) const { return -1; }
+ virtual unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const {
+ return -1;
+ }
+
virtual bool isSingleThreaded() const { return false; }
virtual std::pair<const Value *, unsigned>
@@ -171,6 +175,8 @@ class TargetTransformInfoImplBase {
return nullptr;
}
+ virtual bool isArtificialClobber(Intrinsic::ID IID) const { return false; }
+
virtual bool isLoweredToCall(const Function *F) const {
assert(F && "A concrete function must be provided to this routine.");
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 494199835a19c..10708245b1180 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -440,6 +440,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
}
+ unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const override {
+ return getTLI()->getTargetMachine().getAssumedLiveOnEntryDefAddrSpace(V);
+ }
+
bool isSingleThreaded() const override {
return getTLI()->getTargetMachine().Options.ThreadModel ==
ThreadModel::Single;
@@ -455,6 +459,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return nullptr;
}
+ bool isArtificialClobber(Intrinsic::ID IID) const override { return false; }
+
bool isLegalAddImmediate(int64_t imm) const override {
return getTLI()->isLegalAddImmediate(imm);
}
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index d0fd483a8ddaa..03e0b43686cd4 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -378,6 +378,14 @@ class LLVM_ABI TargetMachine {
/// properties.
virtual unsigned getAssumedAddrSpace(const Value *V) const { return -1; }
+ /// LiveOnEntryDef is the same concept as in MemorySSA.
+ /// Loads and stores from pointer arguments and other global values may be
+ /// defined by memory operations that do not occur in the current function.
+ /// Return the assumed address space for such memory operations.
+ virtual unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const {
+ return -1;
+ }
+
/// If the specified predicate checks whether a generic pointer falls within
/// a specified address space, return that generic pointer and the address
/// space being queried.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c529d87502acd..d943c2171d6a8 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -339,6 +339,11 @@ unsigned TargetTransformInfo::getAssumedAddrSpace(const Value *V) const {
return TTIImpl->getAssumedAddrSpace(V);
}
+unsigned
+TargetTransformInfo::getAssumedLiveOnEntryDefAddrSpace(const Value *V) const {
+ return TTIImpl->getAssumedLiveOnEntryDefAddrSpace(V);
+}
+
bool TargetTransformInfo::isSingleThreaded() const {
return TTIImpl->isSingleThreaded();
}
@@ -353,6 +358,10 @@ Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
}
+bool TargetTransformInfo::isArtificialClobber(Intrinsic::ID IID) const {
+ return TTIImpl->isArtificialClobber(IID);
+}
+
bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
return TTIImpl->isLoweredToCall(F);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
index 06819d05b4be6..8ec9a1b15e6a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -34,8 +34,6 @@ class AMDGPUPromoteKernelArguments : public FunctionPass {
AliasAnalysis *AA;
- Instruction *ArgCastInsertPt;
-
SmallVector<Value *> Ptrs;
void enqueueUsers(Value *Ptr);
@@ -107,24 +105,7 @@ bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
enqueueUsers(Ptr);
- if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
- return Changed;
-
- IRBuilder<> B(LI ? &*std::next(cast<Instruction>(Ptr)->getIterator())
- : ArgCastInsertPt);
-
- // Cast pointer to global address space and back to flat and let
- // Infer Address Spaces pass to do all necessary rewriting.
- PointerType *NewPT =
- PointerType::get(PT->getContext(), AMDGPUAS::GLOBAL_ADDRESS);
- Value *Cast =
- B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global"));
- Value *CastBack =
- B.CreateAddrSpaceCast(Cast, PT, Twine(Ptr->getName(), ".flat"));
- Ptr->replaceUsesWithIf(CastBack,
- [Cast](Use &U) { return U.getUser() != Cast; });
-
- return true;
+ return Changed;
}
bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) {
@@ -135,21 +116,6 @@ bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) {
return true;
}
-// skip allocas
-static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
- BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
- for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
- AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
-
- // If this is a dynamic alloca, the value may depend on the loaded kernargs,
- // so loads will need to be inserted before it.
- if (!AI || !AI->isStaticAlloca())
- break;
- }
-
- return InsPt;
-}
-
bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA,
AliasAnalysis &AA) {
if (skipFunction(F))
@@ -159,7 +125,6 @@ bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA,
if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
return false;
- ArgCastInsertPt = &*getInsertPt(*F.begin());
this->MSSA = &MSSA;
this->AA = &AA;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e5a35abe6da6b..fbda0196b4617 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1042,13 +1042,38 @@ unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
const auto *Ptr = LD->getPointerOperand();
- if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
- return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+
// For a generic pointer loaded from the constant memory, it could be assumed
// as a global pointer since the constant memory is only populated on the
// host side. As implied by the offload programming model, only global
// pointers could be referenced on the host side.
- return AMDGPUAS::GLOBAL_ADDRESS;
+ if (Ptr->getType()->getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
+ return AMDGPUAS::GLOBAL_ADDRESS;
+
+ // For a generic pointer loaded from a readonly, noalias argument, the same
+ // reasoning applies.
+ if (const Argument *Arg = dyn_cast<Argument>(getUnderlyingObject(Ptr)))
+ if (AMDGPU::isModuleEntryFunctionCC(Arg->getParent()->getCallingConv()) &&
+ Arg->onlyReadsMemory() && Arg->hasNoAliasAttr())
+ return AMDGPUAS::GLOBAL_ADDRESS;
+
+ return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+}
+
+unsigned
+AMDGPUTargetMachine::getAssumedLiveOnEntryDefAddrSpace(const Value *V) const {
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ if (AMDGPU::isModuleEntryFunctionCC(
+ I->getParent()->getParent()->getCallingConv()))
+ return AMDGPUAS::GLOBAL_ADDRESS;
+ }
+ if (const LoadInst *LD = dyn_cast<LoadInst>(V)) {
+ // Same reasoning as in getAssumedAddrSpace.
+ if (LD->getPointerOperandType()->getPointerAddressSpace() ==
+ AMDGPUAS::CONSTANT_ADDRESS)
+ return AMDGPUAS::GLOBAL_ADDRESS;
+ }
+ return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
}
std::pair<const Value *, unsigned>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 06a3047196b8a..ea21c095faf75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -62,6 +62,8 @@ class AMDGPUTargetMachine : public CodeGenTargetMachineImpl {
unsigned getAssumedAddrSpace(const Value *V) const override;
+ unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const override;
+
std::pair<const Value *, unsigned>
getPredicatedAddrSpace(const Value *V) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index dfa21515838ff..a151b0c3989fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1223,6 +1223,28 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
}
}
+bool GCNTTIImpl::isArtificialClobber(Intrinsic::ID IID) const {
+ switch (IID) {
+ case Intrinsic::amdgcn_s_barrier:
+ case Intrinsic::amdgcn_s_cluster_barrier:
+ case Intrinsic::amdgcn_s_barrier_signal:
+ case Intrinsic::amdgcn_s_barrier_signal_var:
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_barrier_wait:
+ case Intrinsic::amdgcn_s_barrier_leave:
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ case Intrinsic::amdgcn_wave_barrier:
+ case Intrinsic::amdgcn_sched_barrier:
+ case Intrinsic::amdgcn_sched_group_barrier:
+ case Intrinsic::amdgcn_iglp_opt:
+ return true;
+ default:
+ return false;
+ }
+}
+
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *DstTy, VectorType *SrcTy,
ArrayRef<int> Mask,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 20da8344c9d37..12be42c16d025 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -210,6 +210,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const override;
+ bool isArtificialClobber(Intrinsic::ID IID) const override;
+
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
const Value *Op1, InstCombiner &IC) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 5d5553c573b0f..c61aae8335aa4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -592,6 +592,32 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
return nullptr;
}
+bool NVPTXTTIImpl::isArtificialClobber(Intrinsic::ID IID) const {
+ switch (IID) {
+ case Intrinsic::nvvm_bar_warp_sync:
+ case Intrinsic::nvvm_barrier_cluster_arrive:
+ case Intrinsic::nvvm_barrier_cluster_arrive_aligned:
+ case Intrinsic::nvvm_barrier_cluster_arrive_relaxed:
+ case Intrinsic::nvvm_barrier_cluster_arrive_relaxed_aligned:
+ case Intrinsic::nvvm_barrier_cluster_wait:
+ case Intrinsic::nvvm_barrier_cluster_wait_aligned:
+ case Intrinsic::nvvm_barrier_cta_arrive_aligned_count:
+ case Intrinsic::nvvm_barrier_cta_arrive_count:
+ case Intrinsic::nvvm_barrier_cta_sync_aligned_all:
+ case Intrinsic::nvvm_barrier_cta_sync_aligned_count:
+ case Intrinsic::nvvm_barrier_cta_sync_all:
+ case Intrinsic::nvvm_barrier_cta_sync_count:
+ case Intrinsic::nvvm_barrier0_and:
+ case Intrinsic::nvvm_barrier0_or:
+ case Intrinsic::nvvm_barrier0_popc:
+ case Intrinsic::nvvm_membar_cta:
+ case Intrinsic::nvvm_membar_gl:
+ case Intrinsic::nvvm_membar_sys:
+ return true;
+ default:
+ return false;
+ }
+}
bool NVPTXTTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment,
unsigned AddrSpace,
TTI::MaskKind MaskKind) const {
@@ -657,6 +683,28 @@ unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
}
}
+ if (const auto *LD = dyn_cast<LoadInst>(V)) {
+ // It must be a generic pointer loaded.
+ assert(V->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GENERIC);
+
+ // A generic pointer loaded from a readonly, noalias kernel argument can be
+ // assumed to be a global pointer, since such memory is only populated on
+ // the host side.
+ if (const Argument *Arg =
+ dyn_cast<Argument>(getUnderlyingObject(LD->getPointerOperand())))
+ if (isKernelFunction(*Arg->getParent()) && Arg->onlyReadsMemory() &&
+ Arg->hasNoAliasAttr())
+ return ADDRESS_SPACE_GLOBAL;
+ }
+ return -1;
+}
+
+unsigned NVPTXTTIImpl::getAssumedLiveOnEntryDefAddrSpace(const Value *V) const {
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ if (isKernelFunction(*I->getParent()->getParent())) {
+ return ADDRESS_SPACE_GLOBAL;
+ }
+ }
return -1;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index d7f4e1da4073b..e1cab29df4c1d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -191,8 +191,13 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const override;
+
+ bool isArtificialClobber(Intrinsic::ID IID) const override;
+
unsigned getAssumedAddrSpace(const Value *V) const override;
+ unsigned getAssumedLiveOnEntryDefAddrSpace(const Value *V) const override;
+
void collectKernelLaunchBounds(
const Function &F,
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 352a1b331001a..594ee6d1792e2 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -94,7 +94,9 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
@@ -176,6 +178,8 @@ class InferAddressSpaces : public FunctionPass {
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
}
bool runOnFunction(Function &F) override;
@@ -186,8 +190,9 @@ class InferAddressSpacesImpl {
Function *F = nullptr;
const DominatorTree *DT = nullptr;
const TargetTransformInfo *TTI = nullptr;
+ MemorySSA *MSSA = nullptr;
+ mutable BatchAAResults BatchAA;
const DataLayout *DL = nullptr;
-
/// Target specific address space which uses of should be replaced if
/// possible.
unsigned FlatAddrSpace = 0;
@@ -245,11 +250,19 @@ class InferAddressSpacesImpl {
unsigned getPredicatedAddrSpace(const Value &PtrV,
const Value *UserCtx) const;
+ unsigned
+ getLoadPtrAddrSpaceImpl(const LoadInst *LI, unsigned NewAS, MemoryAccess *MA,
+ ValueToAddrSpaceMapTy &InferredAddrSpace,
+ SmallPtrSet<MemoryAccess *, 8> Visited) const;
+ unsigned getLoadPtrAddrSpace(const LoadInst *LI,
+ ValueToAddrSpaceMapTy &InferredAddrSpace) const;
public:
InferAddressSpacesImpl(AssumptionCache &AC, const DominatorTree *DT,
- const TargetTransformInfo *TTI, unsigned FlatAddrSpace)
- : AC(AC), DT(DT), TTI(TTI), FlatAddrSpace(FlatAddrSpace) {}
+ const TargetTransformInfo *TTI, MemorySSA *MSSA,
+ AliasAnalysis *AA, unsigned FlatAddrSpace)
+ : AC(AC), DT(DT), TTI(TTI), MSSA(MSSA), BatchAA(*AA),
+ FlatAddrSpace(FlatAddrSpace) {}
bool run(Function &F);
};
@@ -261,6 +274,8 @@ INITIALIZE_PASS_BEGIN(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
false, false)
@@ -327,6 +342,9 @@ static bool isAddressExpression(const Value &V, const DataLayout &DL,
case Instruction::AddrSpaceCast:
case Instruction::GetElementPtr:
return true;
+ case Instruction::Load:
+ return TTI->getAssumedLiveOnEntryDefAddrSpace(&V) !=
+ UninitializedAddressSpace;
case Instruction::Select:
return Op->getType()->isPtrOrPtrVectorTy();
case Instruction::Call: {
@@ -360,6 +378,8 @@ getPointerOperands(const Value &V, const DataLayout &DL,
case Instruction::AddrSpaceCast:
case Instruction::GetElementPtr:
return {Op.getOperand(0)};
+ case Instruction::Load:
+ return {};
case Instruction::Select:
return {Op.getOperand(1), Op.getOperand(2)};
case Instruction::Call: {
@@ -561,9 +581,11 @@ InferAddressSpacesImpl::collectFlatAddressExpressions(Function &F) const {
PushPtrOperand(GEP->getPointerOperand());
} else if (auto *LI = dyn_cast<LoadInst>(&I))
PushPtrOperand(LI->getPointerOperand());
- else if (auto *SI = dyn_cast<StoreInst>(&I))
+ else if (auto *SI = dyn_cast<StoreInst>(&I)) {
PushPtrOperand(SI->getPointerOperand());
- else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
+ if (SI->getValueOperand()->getType()->isPtrOrPtrVectorTy())
+ PushPtrOperand(SI->getValueOperand());
+ } else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
PushPtrOperand(RMW->getPointerOperand());
else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
PushPtrOperand(CmpX->getPointerOperand());
@@ -900,6 +922,14 @@ Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace(
return NewI;
}
+ if (auto *LD = dyn_cast<LoadInst>(V)) {
+ Type *NewPtrTy = getPtrOrVecOfPtrsWithNewAS(LD->getType(), NewAddrSpace);
+ auto *NewI = new AddrSpaceCastInst(V, NewPtrTy);
+ NewI->insertAfter(LD->getIterator());
+ NewI->setDebugLoc(LD->getDebugLoc());
+ return NewI;
+ }
+
if (Instruction *I = dyn_cast<Instruction>(V)) {
Value *NewV = cloneInstructionWithNewAddressSpace(
I, NewAddrSpace, ValueWithNewAddrSpace, PredicatedAS, PoisonUsesToFix);
@@ -1027,6 +1057,117 @@ InferAddressSpacesImpl::getPredicatedAddrSpace(const Value &Ptr,
return UninitializedAddressSpace;
}
+static bool isReallyAClobber(const Value *Ptr, MemoryDef *Def,
+ BatchAAResults *AA,
+ const TargetTransformInfo *TTI) {
+ Instruction *DI = Def->getMemoryInst();
+
+ if (auto *CB = dyn_cast<CallBase>(DI);
+ CB && CB->onlyAccessesInaccessibleMemory())
+ return false;
+
+ if (isa<FenceInst>(DI))
+ return false;
+
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DI);
+ II && TTI->isArtificialClobber(II->getIntrinsicID())) {
+ return false;
+ }
+
+ // Ignore atomics that do not alias the original load; from MemorySSA's
+ // point of view any atomic is a universal MemoryDef, just like a fence.
+ const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
+ return I && AA->isNoAlias(MemoryLocation::get(dyn_cast<Instruction>(
+ I->getPointerOperand())),
+ MemoryLocation::get(dyn_cast<LoadInst>(Ptr)));
+ };
+
+ if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DI)) ||
+ checkNoAlias(dyn_cast<AtomicRMWInst>(DI)))
+ return false;
+
+ return true;
+}
+
+unsigned InferAddressSpacesImpl::getLoadPtrAddrSpaceImpl(
+ const LoadInst *LI, unsigned AS, MemoryAccess *MA,
+ ValueToAddrSpaceMapTy &InferredAddrSpace,
+ SmallPtrSet<MemoryAccess *, 8> Visited) const {
+ MemorySSAWalker *Walker = MSSA->getWalker();
+ MemoryLocation Loc(MemoryLocation::get(LI));
+
+ if (MSSA->isLiveOnEntryDef(MA))
+ return TTI->getAssumedLiveOnEntryDefAddrSpace(LI);
+
+ if (!Visited.insert(MA).second)
+ return AS;
+
+ if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
+ LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n');
+
+ if (!isReallyAClobber(LI->getPointerOperand(), Def, &BatchAA, TTI))
+ return getLoadPtrAddrSpaceImpl(
+ LI, AS,
+ Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc),
+ InferredAddrSpace, Visited);
+
+ LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
+ Instruction *DI = Def->getMemoryInst();
+
+ StoreInst *SI = dyn_cast<StoreInst>(DI);
+
+ // TODO: handle other memory writing instructions
+ if (!SI)
+ return FlatAddrSpace;
+
+ Type *ValType = SI->getValueOperand()->getType();
+ unsigned ValAS = FlatAddrSpace;
+ auto I = InferredAddrSpace.find(SI->getValueOperand());
+ if (I != InferredAddrSpace.end())
+ ValAS = I->second;
+ else if (ValType->isPtrOrPtrVectorTy())
+ ValAS = ValType->getPointerAddressSpace();
+
+ AS = joinAddressSpaces(AS, ValAS);
+
+ if (AS == FlatAddrSpace)
+ return FlatAddrSpace;
+
+ if (BatchAA.isMustAlias(Loc, MemoryLocation::get(SI))) {
+ LLVM_DEBUG(dbgs() << " -> must alias with store: " << *SI << "\n");
+ return AS;
+ }
+
+ return getLoadPtrAddrSpaceImpl(
+ LI, AS,
+ Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc),
+ InferredAddrSpace, Visited);
+ }
+
+ const MemoryPhi *Phi = cast<MemoryPhi>(MA);
+ for (const auto &Use : Phi->incoming_values()) {
+ AS = getLoadPtrAddrSpaceImpl(LI, AS, cast<MemoryAccess>(&Use),
+ InferredAddrSpace, Visited);
+ if (AS == FlatAddrSpace)
+ return FlatAddrSpace;
+ }
+
+ return AS;
+}
+
+unsigned InferAddressSpacesImpl::getLoadPtrAddrSpace(
+ const LoadInst *LI, ValueToAddrSpaceMapTy &InferredAddrSpace) const {
+ if (TTI->getAssumedLiveOnEntryDefAddrSpace(LI) == UninitializedAddressSpace)
+ return UninitializedAddressSpace;
+
+ SmallPtrSet<MemoryAccess *, 8> Visited;
+ LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *LI << '\n');
+ return getLoadPtrAddrSpaceImpl(
+ LI, UninitializedAddressSpace,
+ MSSA->getWalker()->getClobberingMemoryAccess(LI), InferredAddrSpace,
+ Visited);
+}
+
bool InferAddressSpacesImpl::updateAddressSpace(
const Value &V, ValueToAddrSpaceMapTy &InferredAddrSpace,
PredicatedAddrSpaceMapTy &PredicatedAS) const {
@@ -1045,6 +1186,8 @@ bool InferAddressSpacesImpl::updateAddressSpace(
if (AS != UninitializedAddressSpace) {
// Use the assumed address space directly.
NewAS = AS;
+ } else if (auto *LD = dyn_cast<LoadInst>(&V)) {
+ NewAS = getLoadPtrAddrSpace(LD, InferredAddrSpace);
} else {
// Otherwise, infer the address space from its pointer operands.
SmallVector<Constant *, 2> ConstantPtrOps;
@@ -1455,7 +1598,8 @@ bool InferAddressSpaces::runOnFunction(Function &F) {
return InferAddressSpacesImpl(
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), DT,
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
- FlatAddrSpace)
+ &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(), FlatAddrSpace)
.run(F);
}
@@ -1473,7 +1617,9 @@ PreservedAnalyses InferAddressSpacesPass::run(Function &F,
bool Changed =
InferAddressSpacesImpl(AM.getResult<AssumptionAnalysis>(F),
AM.getCachedResult<DominatorTreeAnalysis>(F),
- &AM.getResult<TargetIRAnalysis>(F), FlatAddrSpace)
+ &AM.getResult<TargetIRAnalysis>(F),
+ &AM.getResult<MemorySSAAnalysis>(F).getMSSA(),
+ &AM.getResult<AAManager>(F), FlatAddrSpace)
.run(F);
if (Changed) {
PreservedAnalyses PA;
diff --git a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
index 0696cbe5aa891..f68964a96d67a 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
@@ -81,8 +81,8 @@ entry:
define amdgpu_kernel void @flat_ptr_arg(ptr nocapture readonly noalias %Arg, ptr nocapture noalias %Out, i32 %X) {
; CHECK-LABEL: @flat_ptr_arg(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[OUT_GLOBAL:%.*]] = addrspacecast ptr [[OUT:%.*]] to ptr addrspace(1)
; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
+; CHECK-NEXT: [[OUT_GLOBAL:%.*]] = addrspacecast ptr [[OUT:%.*]] to ptr addrspace(1)
; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i64 [[IDXPROM]]
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
index a08110defc8b3..caf7c7abbeab7 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
@@ -159,6 +159,8 @@ define amdgpu_kernel void @loop_with_generic_bound() #0 {
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[END:%.*]] = load ptr, ptr addrspace(1) @generic_end, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[END]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP2]] to ptr
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[I:%.*]] = phi ptr addrspace(3) [ @array, %[[ENTRY]] ], [ [[I2:%.*]], %[[LOOP]] ]
@@ -166,7 +168,7 @@ define amdgpu_kernel void @loop_with_generic_bound() #0 {
; CHECK-NEXT: call void @use(float [[V]])
; CHECK-NEXT: [[I2]] = getelementptr float, ptr addrspace(3) [[I]], i64 1
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr
-; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq ptr [[TMP0]], [[END]]
+; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]]
; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll
index fb3f55ab89497..96c92bb1bb443 100644
--- a/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/load-ptr.ll
@@ -10,10 +10,11 @@ define ptx_kernel void @globalmem_flat_ptr_with_global(ptr %a, ptr %b){
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1)
; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[TMP0]], align 8
+; CHECK-NEXT: [[DOTGLOBAL:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(1)
; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DOTGLOBAL]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]]
; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 4
; CHECK-NEXT: ret void
@@ -29,23 +30,24 @@ entry:
ret void
}
-@shared_ptrs = internal unnamed_addr addrspace(3) global [32 x ptr] undef, align 8
+@shared_ptrs = internal unnamed_addr addrspace(3) global [32 x ptr] poison, align 8
define ptx_kernel void @sharedmem_flat_ptr_with_global(ptr %a, ptr %b) {
; CHECK-LABEL: define ptx_kernel void @sharedmem_flat_ptr_with_global(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
-; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1)
; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP0]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = addrspacecast ptr addrspace(1) [[ARRAYIDX1]] to ptr
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds ptr, ptr addrspace(3) @shared_ptrs, i64 [[IDXPROM]]
; CHECK-NEXT: store ptr [[ARRAYIDX]], ptr addrspace(3) [[ARRAYIDX3]], align 8
; CHECK-NEXT: tail call void @llvm.nvvm.bar.warp.sync(i32 -1)
; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(3) [[ARRAYIDX3]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[DOTGLOBAL:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[DOTGLOBAL]], align 4
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP2]], i64 [[IDXPROM]]
; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX9]], align 4
; CHECK-NEXT: ret void
@@ -73,11 +75,13 @@ define dso_local ptx_kernel void @device_var_with_global(ptr %b) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1)
; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(1) @a, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[DOTGLOBAL1:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[DOTGLOBAL1]], align 8
+; CHECK-NEXT: [[DOTGLOBAL:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(1)
; CHECK-NEXT: [[TMP3:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP3]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP2]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DOTGLOBAL]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP0]], i64 [[IDXPROM]]
; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 4
; CHECK-NEXT: ret void
@@ -102,13 +106,15 @@ define ptx_kernel void @globalmem_flat_ptr_with_global_clobber(ptr %a, ptr %b) {
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(1)
; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) [[TMP0]], align 8
+; CHECK-NEXT: [[DOTGLOBAL:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(1)
+; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(1) [[DOTGLOBAL]] to ptr
; CHECK-NEXT: [[TMP4:%.*]] = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[TMP4]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]]
-; CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(1) [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 4
-; CHECK-NEXT: store i32 [[TMP5]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(1) [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[DOTGLOBAL]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DOTGLOBAL]], i64 4
+; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX4]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -127,7 +133,7 @@ entry:
}
-@s_int2 = internal addrspace(3) global [2 x i32] undef, align 4
+@s_int2 = internal addrspace(3) global [2 x i32] poison, align 4
; Function Attrs: convergent mustprogress noinline norecurse nounwind
define dso_local ptx_kernel void @phi_clobber_with_diff_as(ptr %a, ptr %b) {
@@ -205,7 +211,8 @@ define ptx_kernel void @phi_clobber_with_same_as(ptr %a, ptr %b) {
; CHECK-NEXT: br label %[[IF_END]]
; CHECK: [[IF_END]]:
; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[DOTGLOBAL:%.*]] = addrspacecast ptr [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) [[DOTGLOBAL]], align 4
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP1]], i64 [[IDXPROM]]
; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX7]], align 4
; CHECK-NEXT: ret void