[llvm-branch-commits] [llvm] [AMDGPU][gfx1250] Add `cu-store` option (PR #150588)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jul 25 01:04:36 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mc
Author: Pierre van Houtryve (Pierre-vh)
<details>
<summary>Changes</summary>
Determines whether we can use `SCOPE_CU` stores (on by default), or
whether all stores must be done at `SCOPE_SE` minimum.
---
Full diff: https://github.com/llvm/llvm-project/pull/150588.diff
11 Files Affected:
- (modified) llvm/docs/AMDGPUUsage.rst (+8-1)
- (modified) llvm/include/llvm/Support/AMDHSAKernelDescriptor.h (+2-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+7)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+5-1)
- (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+6)
- (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+3)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3)
- (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp (+5)
- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+3-1)
- (added) llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll (+100)
- (modified) llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test (+4-4)
``````````diff
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index e46437ae092c4..caaae1c3673a3 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -768,6 +768,9 @@ For example:
performant than code generated for XNACK replay
disabled.
+ cu-stores TODO On GFX12.5, controls whether ``scope:SCOPE_CU`` stores may be used.
+ If disabled, all stores will be done at ``scope:SCOPE_SE`` or greater.
+
=============== ============================ ==================================================
.. _amdgpu-target-id:
@@ -5107,7 +5110,9 @@ The fields used by CP for code objects before V3 also match those specified in
and must be 0,
>454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT
_SIZE
- 457:455 3 bits Reserved, must be 0.
+ 455 1 bit USES_CU_STORES GFX12.5: Whether the ``cu-stores`` target attribute is enabled.
+ If 0, then all stores are ``SCOPE_SE`` or higher.
+ 457:456 2 bits Reserved, must be 0.
458 1 bit ENABLE_WAVEFRONT_SIZE32 GFX6-GFX9
Reserved, must be 0.
GFX10-GFX11
@@ -18185,6 +18190,8 @@ terminated by an ``.end_amdhsa_kernel`` directive.
GFX942)
``.amdhsa_user_sgpr_private_segment_size`` 0 GFX6-GFX12 Controls ENABLE_SGPR_PRIVATE_SEGMENT_SIZE in
:ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`.
+ ``.amdhsa_uses_cu_stores`` 0 GFX12.5 Controls USES_CU_STORES in
+ :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`.
``.amdhsa_wavefront_size32`` Target GFX10-GFX12 Controls ENABLE_WAVEFRONT_SIZE32 in
Feature :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`.
Specific
diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
index a119b0724d677..8f367390c531c 100644
--- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
+++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
@@ -223,7 +223,8 @@ enum : int32_t {
KERNEL_CODE_PROPERTY(ENABLE_SGPR_DISPATCH_ID, 4, 1),
KERNEL_CODE_PROPERTY(ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
KERNEL_CODE_PROPERTY(ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1),
- KERNEL_CODE_PROPERTY(RESERVED0, 7, 3),
+ KERNEL_CODE_PROPERTY(RESERVED0, 7, 2),
+ KERNEL_CODE_PROPERTY(USES_CU_STORES, 9, 1), // GFX12.5 +cu-stores
KERNEL_CODE_PROPERTY(ENABLE_WAVEFRONT_SIZE32, 10, 1), // GFX10+
KERNEL_CODE_PROPERTY(USES_DYNAMIC_STACK, 11, 1),
KERNEL_CODE_PROPERTY(RESERVED1, 12, 4),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 2a36f3dea34ce..ca53059680e12 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -268,6 +268,12 @@ def FeatureSafeSmemPrefetch : SubtargetFeature<"safe-smem-prefetch",
"SMEM prefetches do not fail on illegal address"
>;
+def FeatureCUStores : SubtargetFeature<"cu-stores",
+ "HasCUStores",
+ "true",
+ "Whether SCOPE_CU stores can be used on GFX12.5"
+>;
+
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
@@ -1970,6 +1976,7 @@ def FeatureISAVersion12 : FeatureSet<
def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
+ FeatureCUStores,
FeatureCuMode,
Feature64BitLiterals,
FeatureLDSBankCount32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 4b3dc371c65f0..668139383f56c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
MCContext &Ctx = MF.getContext();
uint16_t KernelCodeProperties = 0;
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
KernelCodeProperties |=
@@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
}
- if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
+ if (ST.isWave32()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
+ if (isGFX1250(ST) && ST.hasCUStores()) {
+ KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES;
+ }
// CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
// un-evaluatable at this point so it cannot be conditionally checked here.
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 421fc429048ff..44e65b3588888 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6066,6 +6066,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
ExprVal, ValRange);
if (Val)
ImpliedUserSGPRCount += 1;
+ } else if (ID == ".amdhsa_uses_cu_stores") {
+ if (!isGFX1250())
+ return Error(IDRange.Start, "directive requires gfx12.5", IDRange);
+
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange);
} else if (ID == ".amdhsa_wavefront_size32") {
EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
if (IVersion.Major < 10)
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 5c1989b345bdc..ffe6b0649cb94 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ if (isGFX1250())
+ PRINT_DIRECTIVE(".amdhsa_uses_cu_stores",
+ KERNEL_CODE_PROPERTY_USES_CU_STORES);
if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 0435e7f9e51d2..84f2676602950 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -245,6 +245,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
bool HasSafeSmemPrefetch = false;
+ bool HasCUStores = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
@@ -989,6 +990,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
+ bool hasCUStores() const { return HasCUStores; }
+
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 10f6d3382368f..43ca54894b963 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
".amdhsa_user_sgpr_private_segment_size");
+ if (isGFX1250(STI))
+ PrintField(KD.kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES,
+ ".amdhsa_uses_cu_stores");
if (IVersion.Major >= 10)
PrintField(KD.kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index d6337a85a7361..315dac555ab80 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2554,7 +2554,9 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
// space.
- if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU)
+ // We also require SCOPE_SE minimum if we not have the "cu-stores" feature.
+ if (Scope == CPol::SCOPE_CU &&
+ (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
return setScope(MI, CPol::SCOPE_SE);
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
new file mode 100644
index 0000000000000..d13d76fcfabf4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 -mattr=-cu-stores < %s | FileCheck --check-prefixes=GCN,NOCU %s
+
+; Check that if -cu-stores is used, we use SCOPE_SE minimum on all stores.
+
+; GCN: flat_store:
+; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel flat_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @flat_store(ptr %dst, i32 %val) {
+entry:
+ store i32 %val, ptr %dst
+ ret void
+}
+
+; GCN: global_store:
+; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}}
+; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel global_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @global_store(ptr addrspace(1) %dst, i32 %val) {
+entry:
+ store i32 %val, ptr addrspace(1) %dst
+ ret void
+}
+
+; GCN: local_store:
+; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; GCN: .amdhsa_kernel local_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @local_store(ptr addrspace(3) %dst, i32 %val) {
+entry:
+ store i32 %val, ptr addrspace(3) %dst
+ ret void
+}
+
+; GCN: scratch_store:
+; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel scratch_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @scratch_store(ptr addrspace(5) %dst, i32 %val) {
+entry:
+ store i32 %val, ptr addrspace(5) %dst
+ ret void
+}
+
+; GCN: flat_atomic_store:
+; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel flat_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @flat_atomic_store(ptr %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
+
+; GCN: global_atomic_store:
+; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}}
+; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel global_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @global_atomic_store(ptr addrspace(1) %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr addrspace(1) %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
+
+; GCN: local_atomic_store:
+; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; GCN: .amdhsa_kernel local_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @local_atomic_store(ptr addrspace(3) %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
+
+; GCN: scratch_atomic_store:
+; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel scratch_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
diff --git a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test
index fdca11b95caa6..369005f4ea432 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test
+++ b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test
@@ -13,10 +13,10 @@
# RES_4_2: ; error decoding test.kd: kernel descriptor reserved bits in range (511:480) set
# RES_4_2-NEXT: ; decoding failed region as bytes
-# RUN: yaml2obj %s -DGPU=GFX90A -DKD=00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000006000000000000 \
-# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RES_457
-# RES_457: ; error decoding test.kd: kernel descriptor reserved bits in range (457:455) set
-# RES_457-NEXT: ; decoding failed region as bytes
+# RUN: yaml2obj %s -DGPU=GFX90A -DKD=00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003000000000000 \
+# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RES_456
+# RES_456: ; error decoding test.kd: kernel descriptor reserved bits in range (456:455) set
+# RES_456-NEXT: ; decoding failed region as bytes
# RUN: yaml2obj %s -DGPU=GFX90A -DKD=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000c000000000000 \
# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=WF32
``````````
</details>
https://github.com/llvm/llvm-project/pull/150588
More information about the llvm-branch-commits
mailing list