[llvm] 67c55b1 - [AMDGPU] Make max dwords of memory cluster configurable (#119342)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 17 22:17:30 PST 2024
Author: Ruiling, Song
Date: 2024-12-18T14:17:27+08:00
New Revision: 67c55b1ffc0b09cac66d8b18ada1e876d9312173
URL: https://github.com/llvm/llvm-project/commit/67c55b1ffc0b09cac66d8b18ada1e876d9312173
DIFF: https://github.com/llvm/llvm-project/commit/67c55b1ffc0b09cac66d8b18ada1e876d9312173.diff
LOG: [AMDGPU] Make max dwords of memory cluster configurable (#119342)
We find it helpful to increase the value for graphics workloads. Make it
configurable so we can experiment with different values.
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 057412d41e7a2f..df0320fd0f177b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -554,31 +554,38 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
unsigned NumBytes) const {
// If the mem ops (to be clustered) do not have the same base ptr, then they
// should not be clustered
+ unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
if (!BaseOps1.empty() && !BaseOps2.empty()) {
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;
+
+ const SIMachineFunctionInfo *MFI =
+ FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
+ MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
} else if (!BaseOps1.empty() || !BaseOps2.empty()) {
// If only one base op is empty, they do not have the same base ptr
return false;
}
// In order to avoid register pressure, on an average, the number of DWORDS
- // loaded together by all clustered mem ops should not exceed 8. This is an
- // empirical value based on certain observations and performance related
- // experiments.
+ // loaded together by all clustered mem ops should not exceed
+ // MaxMemoryClusterDWords. This is an empirical value based on certain
+ // observations and performance related experiments.
// The good thing about this heuristic is - it avoids clustering of too many
// sub-word loads, and also avoids clustering of wide loads. Below is the
- // brief summary of how the heuristic behaves for various `LoadSize`.
+ // brief summary of how the heuristic behaves for various `LoadSize` when
+ // MaxMemoryClusterDWords is 8.
+ //
// (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
// (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
// (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
// (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
// (5) LoadSize >= 17: do not cluster
const unsigned LoadSize = NumBytes / ClusterSize;
- const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
- return NumDWORDs <= 8;
+ const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
+ return NumDWords <= MaxMemoryClusterDWords;
}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 960fbb7ea15ce7..8f9ca6141816d4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -36,6 +36,8 @@ class RegScavenger;
class TargetRegisterClass;
class ScheduleHazardRecognizer;
+constexpr unsigned DefaultMemoryClusterDWordsLimit = 8;
+
/// Mark the MMO of a uniform load if there are no potentially clobbering stores
/// on any path from the start of an entry function to this load.
static const MachineMemOperand::Flags MONoClobber =
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 1e43d2727a00da..8d7df73f3cee86 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -163,6 +163,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
if (!S.empty())
S.consumeInteger(0, HighBitsOf32BitAddress);
+ MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
+ "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);
+
// On GFX908, in order to guarantee copying between AGPRs, we need a scratch
// VGPR available at all times. For now, reserve highest available VGPR. After
// RA, shift it to the lowest available unused VGPR if the one exist.
@@ -694,8 +697,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
const llvm::MachineFunction &MF)
: ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
- GDSSize(MFI.getGDSSize()),
- DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
+ GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
+ IsEntryFunction(MFI.isEntryFunction()),
NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
@@ -708,8 +711,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
BytesInStackArgArea(MFI.getBytesInStackArgArea()),
ReturnsVoid(MFI.returnsVoid()),
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
- PSInputAddr(MFI.getPSInputAddr()),
- PSInputEnable(MFI.getPSInputEnable()),
+ PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
+ MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
Mode(MFI.getMode()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
@@ -744,6 +747,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
DynLDSAlign = YamlMFI.DynLDSAlign;
PSInputAddr = YamlMFI.PSInputAddr;
PSInputEnable = YamlMFI.PSInputEnable;
+ MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
Occupancy = YamlMFI.Occupancy;
IsEntryFunction = YamlMFI.IsEntryFunction;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 2a754680fdc8ca..2e2716f1ce8889 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -289,6 +289,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
unsigned PSInputAddr = 0;
unsigned PSInputEnable = 0;
+ unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
SIMode Mode;
std::optional<FrameIndex> ScavengeFI;
@@ -333,6 +334,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
YamlIO.mapOptional("psInputAddr", MFI.PSInputAddr, 0u);
YamlIO.mapOptional("psInputEnable", MFI.PSInputEnable, 0u);
+ YamlIO.mapOptional("maxMemoryClusterDWords", MFI.MaxMemoryClusterDWords,
+ DefaultMemoryClusterDWordsLimit);
YamlIO.mapOptional("mode", MFI.Mode, SIMode());
YamlIO.mapOptional("highBitsOf32BitAddress",
MFI.HighBitsOf32BitAddress, 0u);
@@ -487,6 +490,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// Current recorded maximum possible occupancy.
unsigned Occupancy;
+ // Maximum number of dwords that can be clustered during the instruction
+ // scheduling stage.
+ unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
+
mutable std::optional<bool> UsesAGPRs;
MCPhysReg getNextUserSGPR() const;
@@ -1109,6 +1116,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
limitOccupancy(MF);
}
+ unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; }
+
bool mayNeedAGPRs() const {
return MayNeedAGPRs;
}
diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
index 7f587ac0b87161..80d4fa69be4255 100644
--- a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
+++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
@@ -4,7 +4,7 @@
define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
; GFX11-LABEL: group_image_sample:
; GFX11: ; %bb.0: ; %.entry
-; GFX11-NEXT: s_mov_b32 s24, exec_lo
+; GFX11-NEXT: s_mov_b32 s33, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: s_mov_b32 m0, s4
; GFX11-NEXT: s_getpc_b64 s[4:5]
@@ -21,73 +21,79 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15
; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15
; GFX11-NEXT: s_mov_b32 exec_lo, s16
+; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
+; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x10
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x20
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x30
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x40
-; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
-; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x50
+; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0x60
+; GFX11-NEXT: s_buffer_load_b64 s[28:29], s[12:15], 0x70
+; GFX11-NEXT: s_buffer_load_b64 s[30:31], s[12:15], 0x80
+; GFX11-NEXT: s_buffer_load_b64 s[34:35], s[12:15], 0x90
+; GFX11-NEXT: s_buffer_load_b64 s[36:37], s[12:15], 0xa0
+; GFX11-NEXT: s_buffer_load_b64 s[38:39], s[12:15], 0xb0
+; GFX11-NEXT: s_buffer_load_b64 s[40:41], s[12:15], 0xc0
+; GFX11-NEXT: s_buffer_load_b64 s[42:43], s[12:15], 0xd0
+; GFX11-NEXT: s_buffer_load_b64 s[44:45], s[12:15], 0xe0
+; GFX11-NEXT: s_buffer_load_b64 s[46:47], s[12:15], 0xf0
+; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
+; GFX11-NEXT: v_interp_p2_f32 v36, v2, v1, v4 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v0, v3, v1, v0 wait_exp:7
-; GFX11-NEXT: v_interp_p2_f32 v1, v2, v1, v4 wait_exp:7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v4, s16, v0 :: v_dual_add_f32 v5, s17, v1
-; GFX11-NEXT: v_dual_add_f32 v12, s20, v0 :: v_dual_add_f32 v13, s21, v1
-; GFX11-NEXT: v_dual_add_f32 v8, s18, v0 :: v_dual_add_f32 v9, s19, v1
-; GFX11-NEXT: v_dual_add_f32 v16, s22, v0 :: v_dual_add_f32 v17, s23, v1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v5, s17, v36
+; GFX11-NEXT: v_add_f32_e32 v4, s16, v0
+; GFX11-NEXT: v_add_f32_e32 v8, s18, v0
+; GFX11-NEXT: v_add_f32_e32 v9, s19, v36
+; GFX11-NEXT: v_add_f32_e32 v12, s20, v0
+; GFX11-NEXT: v_add_f32_e32 v13, s21, v36
+; GFX11-NEXT: v_add_f32_e32 v16, s22, v0
+; GFX11-NEXT: v_add_f32_e32 v17, s23, v36
+; GFX11-NEXT: v_add_f32_e32 v20, s24, v0
+; GFX11-NEXT: v_add_f32_e32 v21, s25, v36
+; GFX11-NEXT: v_add_f32_e32 v24, s26, v0
+; GFX11-NEXT: v_add_f32_e32 v25, s27, v36
+; GFX11-NEXT: v_add_f32_e32 v28, s28, v0
+; GFX11-NEXT: v_add_f32_e32 v29, s29, v36
+; GFX11-NEXT: v_add_f32_e32 v32, s30, v0
+; GFX11-NEXT: v_add_f32_e32 v33, s31, v36
+; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: image_sample v[4:7], v[4:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[12:15], v[12:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x50
-; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x60
-; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x70
-; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x80
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_add_f32 v20, s16, v0 :: v_dual_add_f32 v21, s17, v1
-; GFX11-NEXT: v_dual_add_f32 v28, s20, v0 :: v_dual_add_f32 v29, s21, v1
-; GFX11-NEXT: v_dual_add_f32 v24, s18, v0 :: v_dual_add_f32 v25, s19, v1
-; GFX11-NEXT: v_dual_add_f32 v32, s22, v0 :: v_dual_add_f32 v33, s23, v1
-; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[28:31], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x90
-; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xa0
-; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xb0
-; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xc0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_add_f32 v36, s16, v0 :: v_dual_add_f32 v37, s17, v1
-; GFX11-NEXT: v_dual_add_f32 v44, s20, v0 :: v_dual_add_f32 v45, s21, v1
-; GFX11-NEXT: v_dual_add_f32 v40, s18, v0 :: v_dual_add_f32 v41, s19, v1
-; GFX11-NEXT: v_dual_add_f32 v48, s22, v0 :: v_dual_add_f32 v49, s23, v1
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: image_sample v[36:39], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_add_f32_e32 v37, s34, v0
+; GFX11-NEXT: v_add_f32_e32 v38, s35, v36
+; GFX11-NEXT: v_add_f32_e32 v40, s36, v0
+; GFX11-NEXT: v_add_f32_e32 v41, s37, v36
+; GFX11-NEXT: v_add_f32_e32 v44, s38, v0
+; GFX11-NEXT: v_add_f32_e32 v45, s39, v36
+; GFX11-NEXT: v_add_f32_e32 v48, s40, v0
+; GFX11-NEXT: v_add_f32_e32 v49, s41, v36
+; GFX11-NEXT: v_add_f32_e32 v52, s42, v0
+; GFX11-NEXT: v_add_f32_e32 v53, s43, v36
+; GFX11-NEXT: v_add_f32_e32 v56, s44, v0
+; GFX11-NEXT: v_add_f32_e32 v57, s45, v36
+; GFX11-NEXT: v_add_f32_e32 v60, s46, v0
+; GFX11-NEXT: v_add_f32_e32 v61, s47, v36
+; GFX11-NEXT: v_add_f32_e32 v0, s12, v0
+; GFX11-NEXT: v_add_f32_e32 v1, s13, v36
+; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s33
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: image_sample v[36:39], v[37:38], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[44:47], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0xd0
-; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xe0
-; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xf0
-; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_add_f32 v52, s16, v0 :: v_dual_add_f32 v53, s17, v1
-; GFX11-NEXT: v_dual_add_f32 v56, s18, v0 :: v_dual_add_f32 v57, s19, v1
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[52:55], v[52:53], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: v_dual_add_f32 v60, s20, v0 :: v_dual_add_f32 v61, s21, v1
-; GFX11-NEXT: v_dual_add_f32 v0, s12, v0 :: v_dual_add_f32 v1, s13, v1
-; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s24
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[60:63], v[60:61], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[64:67], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(14)
@@ -446,7 +452,7 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8
-attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause"}
+attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause" "amdgpu-max-memory-cluster-dwords"="32"}
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) }
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index 0f7a5f8e0941ad..eb4ee118ec2e42 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -29,6 +29,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
@@ -295,6 +296,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index 7759501ea42268..6f5467b00ebcc5 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -29,6 +29,7 @@
; AFTER-PEI-NEXT: workItemIDX: { reg: '$vgpr0' }
; AFTER-PEI-NEXT: psInputAddr: 0
; AFTER-PEI-NEXT: psInputEnable: 0
+; AFTER-PEI-NEXT: maxMemoryClusterDWords: 8
; AFTER-PEI-NEXT: mode:
; AFTER-PEI-NEXT: ieee: true
; AFTER-PEI-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 4545c8bbeb3e6c..d1d8240a1007a2 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -30,6 +30,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 8215ba834170f2..ad6e92a25b8615 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -30,6 +30,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 51795a4fea515e..3eff89239d5418 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -39,6 +39,7 @@
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
+# FULL-NEXT: maxMemoryClusterDWords: 8
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
@@ -143,6 +144,7 @@ body: |
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
+# FULL-NEXT: maxMemoryClusterDWords: 8
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
@@ -218,6 +220,7 @@ body: |
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
+# FULL-NEXT: maxMemoryClusterDWords: 8
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
@@ -294,6 +297,7 @@ body: |
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
+# FULL-NEXT: maxMemoryClusterDWords: 8
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
@@ -593,3 +597,15 @@ body: |
%2:sgpr_64 = COPY %1
%1:sgpr_64 = COPY %0
...
+
+---
+# ALL-LABEL: name: max_memory_cluster_dwords
+# ALL: maxMemoryClusterDWords: 16
+name: max_memory_cluster_dwords
+machineFunctionInfo:
+ maxMemoryClusterDWords: 16
+body: |
+ bb.0:
+ SI_RETURN
+
+...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index 077d22fc895ae5..eca3f99b64955b 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -40,6 +40,7 @@
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
@@ -86,6 +87,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' }
; CHECK-NEXT: psInputAddr: 1
; CHECK-NEXT: psInputEnable: 1
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: false
; CHECK-NEXT: dx10-clamp: true
@@ -156,6 +158,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
@@ -208,6 +211,7 @@ define void @function() {
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
More information about the llvm-commits
mailing list