[llvm] [AMDGPU] Mark address space cast from private to flat as divergent if target supports globally addressable scratch (PR #152376)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 6 13:27:47 PDT 2025
https://github.com/shiltian created https://github.com/llvm/llvm-project/pull/152376
Globally addressable scratch is a new feature introduced in gfx1250. However, this feature changes how scratch space is mapped into the flat aperture, making address space casts from private to flat no longer uniform.
From 18d9ec996b56a41533b8e428820e7cfdbfaf1bae Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 6 Aug 2025 16:23:26 -0400
Subject: [PATCH] [AMDGPU] Mark address space cast from private to flat as
divergent if target supports globally addressable scratch
Globally addressable scratch is a new feature introduced in gfx1250. However, this feature changes how scratch space is mapped into the flat aperture, making address space casts from private to flat no longer uniform.
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 26 ++++++++++++--
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 25 +++++++++++++
.../AMDGPU/MIR/addrspacecast.mir | 35 +++++++++++++++++++
.../AMDGPU/addrspacecast.ll | 14 ++++++++
4 files changed, 97 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir
create mode 100644 llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index a0c99b0ef0491..846a0b6280f19 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -991,10 +991,21 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
return true;
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
- if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
+ Intrinsic::ID IID = Intrinsic->getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::read_register:
return isReadRegisterSourceOfDivergence(Intrinsic);
-
- return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
+ case Intrinsic::amdgcn_addrspacecast_nonnull: {
+ unsigned SrcAS =
+ Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
+ unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
+ return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ DstAS == AMDGPUAS::FLAT_ADDRESS &&
+ ST->hasGloballyAddressableScratch();
+ }
+ default:
+ return AMDGPU::isIntrinsicSourceOfDivergence(IID);
+ }
}
// Assume all function calls are a source of divergence.
@@ -1008,6 +1019,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
if (isa<InvokeInst>(V))
return true;
+ // If the target supports globally addressable scratch, the mapping from
+ // scratch memory to the flat aperture changes; therefore, an address space
+ // cast is no longer uniform.
+ if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
+ return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+ CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
+ ST->hasGloballyAddressableScratch();
+ }
+
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5f498a3f5a421..f20b22d14c984 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10074,7 +10074,30 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
unsigned opcode = MI.getOpcode();
+
+ auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
+ : MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+ unsigned DstAS = DstTy.getAddressSpace();
+ unsigned SrcAS = SrcTy.getAddressSpace();
+ return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+ DstAS == AMDGPUAS::FLAT_ADDRESS &&
+ ST.hasGloballyAddressableScratch()
+ ? InstructionUniformity::NeverUniform
+ : InstructionUniformity::Default;
+ };
+
+ // If the target supports globally addressable scratch, the mapping from
+ // scratch memory to the flat aperture changes; therefore, an address space
+ // cast is no longer uniform.
+ if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
+ return HandleAddrSpaceCast(MI);
+
if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
auto IID = GI->getIntrinsicID();
if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
@@ -10083,6 +10106,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::AlwaysUniform;
switch (IID) {
+ case Intrinsic::amdgcn_addrspacecast_nonnull:
+ return HandleAddrSpaceCast(MI);
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else:
// FIXME: Uniform if second result
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir
new file mode 100644
index 0000000000000..612f7b7ef4ec4
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir
@@ -0,0 +1,35 @@
+# NOTE: This file is a generic MIR translation of the llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll test file
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=print-machine-uniformity -filetype=null %s 2>&1 | FileCheck %s --check-prefix=UNI
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=print-machine-uniformity -filetype=null %s 2>&1 | FileCheck %s --check-prefix=DIV
+
+# UNI: ALL VALUES UNIFORM
+# DIV: DIVERGENT: %3: %3:_(p0) = G_ADDRSPACE_CAST %2:_(p5)
+# DIV: DIVERGENT: %4: %4:_(p0) = G_INTRINSIC intrinsic(@llvm.amdgcn.addrspacecast.nonnull), %2:_(p5)
+
+--- |
+ define void @foo() {
+ %alloca = alloca i32, align 4, addrspace(5)
+ %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+ store i32 1, ptr %cast, align 4
+ %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+ store i32 2, ptr %cast.1, align 4
+ ret void
+ }
+...
+---
+name: foo
+stack:
+ - { id: 0, name: alloca, type: default, offset: 0, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body: |
+ bb.1 (%ir-block.0):
+ %10:_(s32) = G_CONSTANT i32 1
+ %12:_(s32) = G_CONSTANT i32 2
+ %8:_(p5) = G_FRAME_INDEX %stack.0.alloca
+ %9:_(p0) = G_ADDRSPACE_CAST %8(p5)
+ G_STORE %10(s32), %9(p0) :: (store (s32) into %ir.cast)
+ %11:_(p0) = G_INTRINSIC intrinsic(@llvm.amdgcn.addrspacecast.nonnull), %8(p5)
+ G_STORE %12(s32), %11(p0) :: (store (s32) into %ir.cast.1)
+ SI_RETURN
+...
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll
new file mode 100644
index 0000000000000..e6808448651c8
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll
@@ -0,0 +1,14 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=UNI
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=DIV
+
+; UNI: ALL VALUES UNIFORM
+; DIV: DIVERGENT: %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+; DIV: DIVERGENT: %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+define void @foo() {
+ %alloca = alloca i32, align 4, addrspace(5)
+ %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+ store i32 1, ptr %cast
+ %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+ store i32 2, ptr %cast.1
+ ret void
+}
More information about the llvm-commits
mailing list