[llvm] [AMDGPU] Add function attribute to disable TBUFFER combine (PR #156454)

Tue Sep 2 05:57:01 PDT 2025

https://github.com/harrisonGPU created https://github.com/llvm/llvm-project/pull/156454

Some graphics features do not allow TBUFFER instructions to be combined.

When vertex attribute robustness is enabled, unbound vertex attributes must return (0,0,0,0) or (0,0,0,1).

For typed buffers, if any component accessed by a thread is out of bounds, the entire access for that thread is considered out of bounds and must return zero.
If we combine multiple TBUFFER instructions into a wider load, a single out-of-bounds component would incorrectly cause all components of the combined load to return zero, including those that the original, separate loads would have read in bounds.

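Therefore, TBUFFER combining must be disabled in this case. This patch adds the `amdgpu-disable-tbuffer-combine` function attribute so that a front end can turn it off per function.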

>From 68c6589885c370ca6d9b23ad3ededf1eb519b1f7 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Tue, 2 Sep 2025 20:32:12 +0800
Subject: [PATCH] [AMDGPU] Add function attribute to disable TBUFFER combine

---
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    |  9 +++
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  3 +
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +++
 .../AMDGPU/tbuffer-combine-disable-attr.mir   | 65 +++++++++++++++++++
 4 files changed, 87 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/tbuffer-combine-disable-attr.mir

diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 6f2ea8ad1ff01..740e574bf70c9 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -62,6 +62,7 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIDefines.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
@@ -2456,6 +2457,14 @@ SILoadStoreOptimizer::collectMergeableInsts(
         LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
         continue;
       }
+
+      const MachineFunction *MF = MI.getParent()->getParent();
+      const auto *MFI = MF->getInfo<SIMachineFunctionInfo>();
+      if (MFI->isTBufferCombineDisabled()) {
+        LLVM_DEBUG(
+            dbgs() << "Skip TBUFFER combine: disabled by function attribute\n");
+        continue;
+      }
     }
 
     CombineInfo CI;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 8a1120321af9f..717517e83adfd 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -195,6 +195,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
     VGPRForAGPRCopy =
         AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
   }
+
+  if (F.hasFnAttribute("amdgpu-disable-tbuffer-combine"))
+    setDisableTBufferCombine(true);
 }
 
 MachineFunctionInfo *SIMachineFunctionInfo::clone(
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ca8f8033a2d54..0e8a0b75c0491 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -293,6 +293,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   unsigned PSInputEnable = 0;
   unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
 
+  bool DisableTBufferCombine = false;
+
   SIMode Mode;
   std::optional<FrameIndex> ScavengeFI;
   StringValue VGPRForAGPRCopy;
@@ -525,6 +527,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   // scheduler stage.
   unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
 
+  // Disable combining of TBUFFER instructions.
+  bool DisableTBufferCombine = false;
+
   MCPhysReg getNextUserSGPR() const;
 
   MCPhysReg getNextSystemSGPR() const;
@@ -1207,6 +1212,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
   unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
   unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
+
+  bool isTBufferCombineDisabled() const { return DisableTBufferCombine; }
+  void setDisableTBufferCombine(bool IsDisabled) {
+    DisableTBufferCombine = IsDisabled;
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/tbuffer-combine-disable-attr.mir b/llvm/test/CodeGen/AMDGPU/tbuffer-combine-disable-attr.mir
new file mode 100644
index 0000000000000..cd119aacf7496
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/tbuffer-combine-disable-attr.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=si-load-store-opt -verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+  target triple = "amdgcn"
+
+  define float @disable-tbuffer-combine(<4 x i32> %vec, i32 %index) #0 {
+    %1 = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %vec, i32 %index, i32 0, i32 0, i32 22, i32 0)
+    %2 = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %vec, i32 %index, i32 4, i32 0, i32 22, i32 0)
+    %3 = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %vec, i32 %index, i32 8, i32 0, i32 22, i32 0)
+    %4 = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %vec, i32 %index, i32 12, i32 0, i32 22, i32 0)
+    %5 = bitcast i32 %1 to float
+    %6 = bitcast i32 %2 to float
+    %7 = bitcast i32 %3 to float
+    %8 = bitcast i32 %4 to float
+    %add = fadd float %5, %6
+    %mul = fmul float %7, %8
+    %res = fadd float %add, %mul
+    ret float %res
+  }
+
+  attributes #0 = {"amdgpu-disable-tbuffer-combine"}
+...
+---
+name:            disable-tbuffer-combine
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+    ; CHECK-LABEL: name: disable-tbuffer-combine
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
+    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+    ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+    ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+    ; CHECK-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+    ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[TBUFFER_LOAD_FORMAT_X_IDXEN]], 0, killed [[TBUFFER_LOAD_FORMAT_X_IDXEN1]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed [[TBUFFER_LOAD_FORMAT_X_IDXEN2]], 0, killed [[TBUFFER_LOAD_FORMAT_X_IDXEN3]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, killed [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_1]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %12:vgpr_32 = COPY $vgpr4
+    %11:vgpr_32 = COPY $vgpr3
+    %10:vgpr_32 = COPY $vgpr2
+    %9:vgpr_32 = COPY $vgpr1
+    %8:vgpr_32 = COPY $vgpr0
+    %13:sgpr_128 = REG_SEQUENCE %8, %subreg.sub0, %9, %subreg.sub1, %10, %subreg.sub2, %11, %subreg.sub3
+    %14:sreg_32 = S_MOV_B32 0
+    %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %12, %13, %14, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+    %16:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %12, %13, %14, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+    %17:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %12, %13, %14, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+    %18:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %12, %13, %14, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+    %19:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed %15, 0, killed %16, 0, 0, implicit $mode, implicit $exec
+    %20:vgpr_32 = nofpexcept V_MUL_F32_e64 0, killed %17, 0, killed %18, 0, 0, implicit $mode, implicit $exec
+    %21:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed %19, 0, killed %20, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %21
+    SI_RETURN implicit $vgpr0
+...