[llvm] [AMDGPU] Add function attribute to disable TBUFFER combine (PR #156454)
Harrison Hao via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 2 05:57:01 PDT 2025
https://github.com/harrisonGPU created https://github.com/llvm/llvm-project/pull/156454
Some graphics features do not allow combining TBUFFER instructions.
When vertex attribute robustness is enabled, unbound vertex attributes must return (0,0,0,0) or (0,0,0,1).
For typed buffers, if any component of a thread's access is out of bounds, the entire thread is considered out of bounds and must return zero.
If we combine multiple TBUFFER instructions into a wider load, a single out-of-bounds component would incorrectly cause all components to return zero.
Therefore, TBUFFER combine must be disabled in this case.
From 68c6589885c370ca6d9b23ad3ededf1eb519b1f7 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Tue, 2 Sep 2025 20:32:12 +0800
Subject: [PATCH] [AMDGPU] Add function attribute to disable TBUFFER combine
---
.../Target/AMDGPU/SILoadStoreOptimizer.cpp | 9 +++
.../Target/AMDGPU/SIMachineFunctionInfo.cpp | 3 +
.../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +++
.../AMDGPU/tbuffer-combine-disable-attr.mir | 65 +++++++++++++++++++
4 files changed, 87 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/tbuffer-combine-disable-attr.mir
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 6f2ea8ad1ff01..740e574bf70c9 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -62,6 +62,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -2456,6 +2457,14 @@ SILoadStoreOptimizer::collectMergeableInsts(
LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
continue;
}
+
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const auto *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ if (MFI->isTBufferCombineDisabled()) {
+ LLVM_DEBUG(
+ dbgs() << "Skip TBUFFER combine: disabled by function attribute\n");
+ continue;
+ }
}
CombineInfo CI;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 8a1120321af9f..717517e83adfd 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -195,6 +195,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
VGPRForAGPRCopy =
AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
}
+
+ if (F.hasFnAttribute("amdgpu-disable-tbuffer-combine"))
+ setDisableTBufferCombine(true);
}
MachineFunctionInfo *SIMachineFunctionInfo::clone(
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ca8f8033a2d54..0e8a0b75c0491 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -293,6 +293,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
unsigned PSInputEnable = 0;
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
+ bool DisableTBufferCombine = false;
+
SIMode Mode;
std::optional<FrameIndex> ScavengeFI;
StringValue VGPRForAGPRCopy;
@@ -525,6 +527,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// scheduler stage.
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
+ // Disable combining of TBUFFER instructions.
+ bool DisableTBufferCombine = false;
+
MCPhysReg getNextUserSGPR() const;
MCPhysReg getNextSystemSGPR() const;
@@ -1207,6 +1212,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
+
+ bool isTBufferCombineDisabled() const { return DisableTBufferCombine; }
+ void setDisableTBufferCombine(bool IsDisabled) {
+ DisableTBufferCombine = IsDisabled;
+ }
};
} // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/tbuffer-combine-disable-attr.mir b/llvm/test/CodeGen/AMDGPU/tbuffer-combine-disable-attr.mir
new file mode 100644
index 0000000000000..cd119aacf7496
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tbuffer-combine-disable-attr.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=si-load-store-opt -verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+ target triple = "amdgcn"
+
+ define float @disable-tbuffer-combine(<4 x i32> %vec, i32 %index) #0 {
+ %1 = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %vec, i32 %index, i32 0, i32 0, i32 22, i32 0)
+ %2 = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %vec, i32 %index, i32 4, i32 0, i32 22, i32 0)
+ %3 = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %vec, i32 %index, i32 8, i32 0, i32 22, i32 0)
+ %4 = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %vec, i32 %index, i32 12, i32 0, i32 22, i32 0)
+ %5 = bitcast i32 %1 to float
+ %6 = bitcast i32 %2 to float
+ %7 = bitcast i32 %3 to float
+ %8 = bitcast i32 %4 to float
+ %add = fadd float %5, %6
+ %mul = fmul float %7, %8
+ %res = fadd float %add, %mul
+ ret float %res
+ }
+
+ attributes #0 = {"amdgpu-disable-tbuffer-combine"}
+...
+---
+name: disable-tbuffer-combine
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+ ; CHECK-LABEL: name: disable-tbuffer-combine
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[TBUFFER_LOAD_FORMAT_X_IDXEN]], 0, killed [[TBUFFER_LOAD_FORMAT_X_IDXEN1]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[TBUFFER_LOAD_FORMAT_X_IDXEN2]], 0, killed [[TBUFFER_LOAD_FORMAT_X_IDXEN3]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, killed [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_1]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ %12:vgpr_32 = COPY $vgpr4
+ %11:vgpr_32 = COPY $vgpr3
+ %10:vgpr_32 = COPY $vgpr2
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %13:sgpr_128 = REG_SEQUENCE %8, %subreg.sub0, %9, %subreg.sub1, %10, %subreg.sub2, %11, %subreg.sub3
+ %14:sreg_32 = S_MOV_B32 0
+ %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %12, %13, %14, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %16:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %12, %13, %14, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %17:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %12, %13, %14, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %18:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %12, %13, %14, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %19:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed %15, 0, killed %16, 0, 0, implicit $mode, implicit $exec
+ %20:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %17, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+ %21:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed %19, 0, killed %20, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %21
+ SI_RETURN implicit $vgpr0
+...
More information about the llvm-commits
mailing list