[llvm] [AMDGPU] Make max dwords of memory cluster configurable (PR #119342)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 16 23:53:49 PST 2024
https://github.com/ruiling updated https://github.com/llvm/llvm-project/pull/119342
>From 7a8af305e1bc2fc60e30d8486c94215d1d1206ca Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Wed, 27 Nov 2024 16:10:20 +0800
Subject: [PATCH 1/3] [AMDGPU] Make max dwords of memory cluster configurable
We find it helpful to increase this value for graphics workloads.
Make it configurable so we can experiment with different values.
It might be more helpful if we could have a per-function value, but I am
not sure how this can be done properly.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4a94d690297949..ed651adcc8cd2e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -60,6 +60,11 @@ static cl::opt<bool> Fix16BitCopies(
cl::init(true),
cl::ReallyHidden);
+static cl::opt<unsigned> MaxMemoryClusterDWORDS(
+ "amdgpu-max-memory-cluster-dwords", cl::Hidden, cl::init(8),
+ cl::desc(
+ "Restrict the maximum dwords for memory cluster during scheduler"));
+
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
: AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
RI(ST), ST(ST) {
@@ -565,12 +570,14 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
}
// In order to avoid register pressure, on an average, the number of DWORDS
- // loaded together by all clustered mem ops should not exceed 8. This is an
- // empirical value based on certain observations and performance related
- // experiments.
+ // loaded together by all clustered mem ops should not exceed
+ // MaxMemoryClusterDWORDS. This is an empirical value based on certain
+ // observations and performance related experiments.
// The good thing about this heuristic is - it avoids clustering of too many
// sub-word loads, and also avoids clustering of wide loads. Below is the
- // brief summary of how the heuristic behaves for various `LoadSize`.
+ // brief summary of how the heuristic behaves for various `LoadSize` when
+ // MaxMemoryClusterDWORDS is 8.
+ //
// (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
// (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
// (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
@@ -578,7 +585,7 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
// (5) LoadSize >= 17: do not cluster
const unsigned LoadSize = NumBytes / ClusterSize;
const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
- return NumDWORDs <= 8;
+ return NumDWORDs <= MaxMemoryClusterDWORDS;
}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
>From e749e58c52b36846d49a31d68249e145fe883b31 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Tue, 17 Dec 2024 10:35:39 +0800
Subject: [PATCH 2/3] Add new function attribute
amdgpu-max-memory-cluster-dwords
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 19 ++--
.../Target/AMDGPU/SIMachineFunctionInfo.cpp | 14 ++-
.../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 9 ++
.../AMDGPU/group-image-instructions.ll | 104 +++++++++---------
.../AMDGPU/long-branch-reg-all-sgpr-used.ll | 2 +
.../AMDGPU/machine-function-info-after-pei.ll | 1 +
...ine-function-info-long-branch-reg-debug.ll | 1 +
.../machine-function-info-long-branch-reg.ll | 1 +
.../AMDGPU/machine-function-info-no-ir.mir | 4 +
.../MIR/AMDGPU/machine-function-info.ll | 4 +
10 files changed, 97 insertions(+), 62 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ed651adcc8cd2e..f8f604222f3a6f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -60,11 +60,6 @@ static cl::opt<bool> Fix16BitCopies(
cl::init(true),
cl::ReallyHidden);
-static cl::opt<unsigned> MaxMemoryClusterDWORDS(
- "amdgpu-max-memory-cluster-dwords", cl::Hidden, cl::init(8),
- cl::desc(
- "Restrict the maximum dwords for memory cluster during scheduler"));
-
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
: AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
RI(ST), ST(ST) {
@@ -559,11 +554,17 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
unsigned NumBytes) const {
// If the mem ops (to be clustered) do not have the same base ptr, then they
// should not be clustered
+ unsigned MaxMemoryClusterDWords = 8;
if (!BaseOps1.empty() && !BaseOps2.empty()) {
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;
+
+ const SIMachineFunctionInfo *MFI =
+ FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
+ if (MFI->getMaxMemoryClusterDWords())
+ MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
} else if (!BaseOps1.empty() || !BaseOps2.empty()) {
// If only one base op is empty, they do not have the same base ptr
return false;
@@ -571,12 +572,12 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
// In order to avoid register pressure, on an average, the number of DWORDS
// loaded together by all clustered mem ops should not exceed
- // MaxMemoryClusterDWORDS. This is an empirical value based on certain
+ // MaxMemoryClusterDWords. This is an empirical value based on certain
// observations and performance related experiments.
// The good thing about this heuristic is - it avoids clustering of too many
// sub-word loads, and also avoids clustering of wide loads. Below is the
// brief summary of how the heuristic behaves for various `LoadSize` when
- // MaxMemoryClusterDWORDS is 8.
+ // MaxMemoryClusterDWords is 8.
//
// (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
// (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
@@ -584,8 +585,8 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
// (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
// (5) LoadSize >= 17: do not cluster
const unsigned LoadSize = NumBytes / ClusterSize;
- const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
- return NumDWORDs <= MaxMemoryClusterDWORDS;
+ const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
+ return NumDWords <= MaxMemoryClusterDWords;
}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 1e43d2727a00da..d331b89b4f2e63 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -163,6 +163,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
if (!S.empty())
S.consumeInteger(0, HighBitsOf32BitAddress);
+ A = F.getFnAttribute("amdgpu-max-memory-cluster-dwords");
+ S = A.getValueAsString();
+ if (!S.empty())
+ S.consumeInteger(0, MaxMemoryClusterDWords);
+
// On GFX908, in order to guarantee copying between AGPRs, we need a scratch
// VGPR available at all times. For now, reserve highest available VGPR. After
// RA, shift it to the lowest available unused VGPR if the one exist.
@@ -694,8 +699,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
const llvm::MachineFunction &MF)
: ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
- GDSSize(MFI.getGDSSize()),
- DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
+ GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
+ IsEntryFunction(MFI.isEntryFunction()),
NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
@@ -708,8 +713,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
BytesInStackArgArea(MFI.getBytesInStackArgArea()),
ReturnsVoid(MFI.returnsVoid()),
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
- PSInputAddr(MFI.getPSInputAddr()),
- PSInputEnable(MFI.getPSInputEnable()),
+ PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
+ MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
Mode(MFI.getMode()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
@@ -744,6 +749,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
DynLDSAlign = YamlMFI.DynLDSAlign;
PSInputAddr = YamlMFI.PSInputAddr;
PSInputEnable = YamlMFI.PSInputEnable;
+ MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
Occupancy = YamlMFI.Occupancy;
IsEntryFunction = YamlMFI.IsEntryFunction;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 2a754680fdc8ca..340c7ee0d957af 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -289,6 +289,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
unsigned PSInputAddr = 0;
unsigned PSInputEnable = 0;
+ unsigned MaxMemoryClusterDWords = 0;
SIMode Mode;
std::optional<FrameIndex> ScavengeFI;
@@ -333,6 +334,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
YamlIO.mapOptional("psInputAddr", MFI.PSInputAddr, 0u);
YamlIO.mapOptional("psInputEnable", MFI.PSInputEnable, 0u);
+ YamlIO.mapOptional("maxMemoryClusterDWords", MFI.MaxMemoryClusterDWords,
+ 0u);
YamlIO.mapOptional("mode", MFI.Mode, SIMode());
YamlIO.mapOptional("highBitsOf32BitAddress",
MFI.HighBitsOf32BitAddress, 0u);
@@ -487,6 +490,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// Current recorded maximum possible occupancy.
unsigned Occupancy;
+ // Maximum number of dwords that can be clusterred during instruction
+ // scheduler stage.
+ unsigned MaxMemoryClusterDWords = 0;
+
mutable std::optional<bool> UsesAGPRs;
MCPhysReg getNextUserSGPR() const;
@@ -1109,6 +1116,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
limitOccupancy(MF);
}
+ unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; }
+
bool mayNeedAGPRs() const {
return MayNeedAGPRs;
}
diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
index 7f587ac0b87161..80d4fa69be4255 100644
--- a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
+++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
@@ -4,7 +4,7 @@
define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
; GFX11-LABEL: group_image_sample:
; GFX11: ; %bb.0: ; %.entry
-; GFX11-NEXT: s_mov_b32 s24, exec_lo
+; GFX11-NEXT: s_mov_b32 s33, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: s_mov_b32 m0, s4
; GFX11-NEXT: s_getpc_b64 s[4:5]
@@ -21,73 +21,79 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15
; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15
; GFX11-NEXT: s_mov_b32 exec_lo, s16
+; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
+; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x10
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x20
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x30
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x40
-; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
-; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x50
+; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0x60
+; GFX11-NEXT: s_buffer_load_b64 s[28:29], s[12:15], 0x70
+; GFX11-NEXT: s_buffer_load_b64 s[30:31], s[12:15], 0x80
+; GFX11-NEXT: s_buffer_load_b64 s[34:35], s[12:15], 0x90
+; GFX11-NEXT: s_buffer_load_b64 s[36:37], s[12:15], 0xa0
+; GFX11-NEXT: s_buffer_load_b64 s[38:39], s[12:15], 0xb0
+; GFX11-NEXT: s_buffer_load_b64 s[40:41], s[12:15], 0xc0
+; GFX11-NEXT: s_buffer_load_b64 s[42:43], s[12:15], 0xd0
+; GFX11-NEXT: s_buffer_load_b64 s[44:45], s[12:15], 0xe0
+; GFX11-NEXT: s_buffer_load_b64 s[46:47], s[12:15], 0xf0
+; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
+; GFX11-NEXT: v_interp_p2_f32 v36, v2, v1, v4 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v0, v3, v1, v0 wait_exp:7
-; GFX11-NEXT: v_interp_p2_f32 v1, v2, v1, v4 wait_exp:7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v4, s16, v0 :: v_dual_add_f32 v5, s17, v1
-; GFX11-NEXT: v_dual_add_f32 v12, s20, v0 :: v_dual_add_f32 v13, s21, v1
-; GFX11-NEXT: v_dual_add_f32 v8, s18, v0 :: v_dual_add_f32 v9, s19, v1
-; GFX11-NEXT: v_dual_add_f32 v16, s22, v0 :: v_dual_add_f32 v17, s23, v1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v5, s17, v36
+; GFX11-NEXT: v_add_f32_e32 v4, s16, v0
+; GFX11-NEXT: v_add_f32_e32 v8, s18, v0
+; GFX11-NEXT: v_add_f32_e32 v9, s19, v36
+; GFX11-NEXT: v_add_f32_e32 v12, s20, v0
+; GFX11-NEXT: v_add_f32_e32 v13, s21, v36
+; GFX11-NEXT: v_add_f32_e32 v16, s22, v0
+; GFX11-NEXT: v_add_f32_e32 v17, s23, v36
+; GFX11-NEXT: v_add_f32_e32 v20, s24, v0
+; GFX11-NEXT: v_add_f32_e32 v21, s25, v36
+; GFX11-NEXT: v_add_f32_e32 v24, s26, v0
+; GFX11-NEXT: v_add_f32_e32 v25, s27, v36
+; GFX11-NEXT: v_add_f32_e32 v28, s28, v0
+; GFX11-NEXT: v_add_f32_e32 v29, s29, v36
+; GFX11-NEXT: v_add_f32_e32 v32, s30, v0
+; GFX11-NEXT: v_add_f32_e32 v33, s31, v36
+; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: image_sample v[4:7], v[4:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[12:15], v[12:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x50
-; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x60
-; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x70
-; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x80
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_add_f32 v20, s16, v0 :: v_dual_add_f32 v21, s17, v1
-; GFX11-NEXT: v_dual_add_f32 v28, s20, v0 :: v_dual_add_f32 v29, s21, v1
-; GFX11-NEXT: v_dual_add_f32 v24, s18, v0 :: v_dual_add_f32 v25, s19, v1
-; GFX11-NEXT: v_dual_add_f32 v32, s22, v0 :: v_dual_add_f32 v33, s23, v1
-; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[28:31], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x90
-; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xa0
-; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xb0
-; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xc0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_add_f32 v36, s16, v0 :: v_dual_add_f32 v37, s17, v1
-; GFX11-NEXT: v_dual_add_f32 v44, s20, v0 :: v_dual_add_f32 v45, s21, v1
-; GFX11-NEXT: v_dual_add_f32 v40, s18, v0 :: v_dual_add_f32 v41, s19, v1
-; GFX11-NEXT: v_dual_add_f32 v48, s22, v0 :: v_dual_add_f32 v49, s23, v1
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: image_sample v[36:39], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: v_add_f32_e32 v37, s34, v0
+; GFX11-NEXT: v_add_f32_e32 v38, s35, v36
+; GFX11-NEXT: v_add_f32_e32 v40, s36, v0
+; GFX11-NEXT: v_add_f32_e32 v41, s37, v36
+; GFX11-NEXT: v_add_f32_e32 v44, s38, v0
+; GFX11-NEXT: v_add_f32_e32 v45, s39, v36
+; GFX11-NEXT: v_add_f32_e32 v48, s40, v0
+; GFX11-NEXT: v_add_f32_e32 v49, s41, v36
+; GFX11-NEXT: v_add_f32_e32 v52, s42, v0
+; GFX11-NEXT: v_add_f32_e32 v53, s43, v36
+; GFX11-NEXT: v_add_f32_e32 v56, s44, v0
+; GFX11-NEXT: v_add_f32_e32 v57, s45, v36
+; GFX11-NEXT: v_add_f32_e32 v60, s46, v0
+; GFX11-NEXT: v_add_f32_e32 v61, s47, v36
+; GFX11-NEXT: v_add_f32_e32 v0, s12, v0
+; GFX11-NEXT: v_add_f32_e32 v1, s13, v36
+; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s33
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: image_sample v[36:39], v[37:38], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[44:47], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0xd0
-; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xe0
-; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xf0
-; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_add_f32 v52, s16, v0 :: v_dual_add_f32 v53, s17, v1
-; GFX11-NEXT: v_dual_add_f32 v56, s18, v0 :: v_dual_add_f32 v57, s19, v1
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[52:55], v[52:53], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: v_dual_add_f32 v60, s20, v0 :: v_dual_add_f32 v61, s21, v1
-; GFX11-NEXT: v_dual_add_f32 v0, s12, v0 :: v_dual_add_f32 v1, s13, v1
-; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s24
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[60:63], v[60:61], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[64:67], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(14)
@@ -446,7 +452,7 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8
-attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause"}
+attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause" "amdgpu-max-memory-cluster-dwords"="32"}
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) }
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index 0f7a5f8e0941ad..35f5ac30e0593e 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -29,6 +29,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 0
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
@@ -295,6 +296,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 0
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index 7759501ea42268..2782cb35ba4034 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -29,6 +29,7 @@
; AFTER-PEI-NEXT: workItemIDX: { reg: '$vgpr0' }
; AFTER-PEI-NEXT: psInputAddr: 0
; AFTER-PEI-NEXT: psInputEnable: 0
+; AFTER-PEI-NEXT: maxMemoryClusterDWords: 0
; AFTER-PEI-NEXT: mode:
; AFTER-PEI-NEXT: ieee: true
; AFTER-PEI-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 4545c8bbeb3e6c..4e2efe9c9d8b1a 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -30,6 +30,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 0
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 8215ba834170f2..7559067047ac8b 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -30,6 +30,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 0
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 51795a4fea515e..e5ffe7a2a95067 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -39,6 +39,7 @@
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
+# FULL-NEXT: maxMemoryClusterDWords: 0
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
@@ -143,6 +144,7 @@ body: |
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
+# FULL-NEXT: maxMemoryClusterDWords: 0
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
@@ -218,6 +220,7 @@ body: |
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
+# FULL-NEXT: maxMemoryClusterDWords: 0
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
@@ -294,6 +297,7 @@ body: |
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
+# FULL-NEXT: maxMemoryClusterDWords: 0
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index 077d22fc895ae5..7c29827beb8902 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -40,6 +40,7 @@
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 0
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
@@ -86,6 +87,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' }
; CHECK-NEXT: psInputAddr: 1
; CHECK-NEXT: psInputEnable: 1
+; CHECK-NEXT: maxMemoryClusterDWords: 0
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: false
; CHECK-NEXT: dx10-clamp: true
@@ -156,6 +158,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 0
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
@@ -208,6 +211,7 @@ define void @function() {
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 0
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
>From c4f164e19e30084029d938ab22f16646236261c5 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Tue, 17 Dec 2024 15:51:39 +0800
Subject: [PATCH 3/3] Address review comments
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 5 ++---
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 ++
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 6 ++----
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 2 +-
.../MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll | 4 ++--
.../MIR/AMDGPU/machine-function-info-after-pei.ll | 2 +-
.../machine-function-info-long-branch-reg-debug.ll | 2 +-
.../AMDGPU/machine-function-info-long-branch-reg.ll | 2 +-
.../MIR/AMDGPU/machine-function-info-no-ir.mir | 12 ++++++++++++
.../test/CodeGen/MIR/AMDGPU/machine-function-info.ll | 8 ++++----
10 files changed, 28 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f8f604222f3a6f..9c1b76f516dadf 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -554,7 +554,7 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
unsigned NumBytes) const {
// If the mem ops (to be clustered) do not have the same base ptr, then they
// should not be clustered
- unsigned MaxMemoryClusterDWords = 8;
+ unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
if (!BaseOps1.empty() && !BaseOps2.empty()) {
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
@@ -563,8 +563,7 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
const SIMachineFunctionInfo *MFI =
FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
- if (MFI->getMaxMemoryClusterDWords())
- MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
+ MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
} else if (!BaseOps1.empty() || !BaseOps2.empty()) {
// If only one base op is empty, they do not have the same base ptr
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 960fbb7ea15ce7..8f9ca6141816d4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -36,6 +36,8 @@ class RegScavenger;
class TargetRegisterClass;
class ScheduleHazardRecognizer;
+constexpr unsigned DefaultMemoryClusterDWordsLimit = 8;
+
/// Mark the MMO of a uniform load if there are no potentially clobbering stores
/// on any path from the start of an entry function to this load.
static const MachineMemOperand::Flags MONoClobber =
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index d331b89b4f2e63..8d7df73f3cee86 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -163,10 +163,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
if (!S.empty())
S.consumeInteger(0, HighBitsOf32BitAddress);
- A = F.getFnAttribute("amdgpu-max-memory-cluster-dwords");
- S = A.getValueAsString();
- if (!S.empty())
- S.consumeInteger(0, MaxMemoryClusterDWords);
+ MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
+ "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);
// On GFX908, in order to guarantee copying between AGPRs, we need a scratch
// VGPR available at all times. For now, reserve highest available VGPR. After
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 340c7ee0d957af..bc4665adb67768 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -492,7 +492,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// Maximum number of dwords that can be clusterred during instruction
// scheduler stage.
- unsigned MaxMemoryClusterDWords = 0;
+ unsigned MaxMemoryClusterDWords;
mutable std::optional<bool> UsesAGPRs;
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index 35f5ac30e0593e..eb4ee118ec2e42 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -29,7 +29,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
-; CHECK-NEXT: maxMemoryClusterDWords: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
@@ -296,7 +296,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
-; CHECK-NEXT: maxMemoryClusterDWords: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index 2782cb35ba4034..6f5467b00ebcc5 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -29,7 +29,7 @@
; AFTER-PEI-NEXT: workItemIDX: { reg: '$vgpr0' }
; AFTER-PEI-NEXT: psInputAddr: 0
; AFTER-PEI-NEXT: psInputEnable: 0
-; AFTER-PEI-NEXT: maxMemoryClusterDWords: 0
+; AFTER-PEI-NEXT: maxMemoryClusterDWords: 8
; AFTER-PEI-NEXT: mode:
; AFTER-PEI-NEXT: ieee: true
; AFTER-PEI-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 4e2efe9c9d8b1a..d1d8240a1007a2 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -30,7 +30,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
-; CHECK-NEXT: maxMemoryClusterDWords: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 7559067047ac8b..ad6e92a25b8615 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -30,7 +30,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
-; CHECK-NEXT: maxMemoryClusterDWords: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index e5ffe7a2a95067..227a72c7c5c4a1 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -597,3 +597,15 @@ body: |
%2:sgpr_64 = COPY %1
%1:sgpr_64 = COPY %0
...
+
+---
+# ALL-LABEL: name: max_memory_cluster_dwords
+# ALL: maxMemoryClusterDWords: 16
+name: max_memory_cluster_dwords
+machineFunctionInfo:
+ maxMemoryClusterDWords: 16
+body: |
+ bb.0:
+ SI_RETURN
+
+...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index 7c29827beb8902..eca3f99b64955b 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -40,7 +40,7 @@
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
-; CHECK-NEXT: maxMemoryClusterDWords: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
@@ -87,7 +87,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' }
; CHECK-NEXT: psInputAddr: 1
; CHECK-NEXT: psInputEnable: 1
-; CHECK-NEXT: maxMemoryClusterDWords: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: false
; CHECK-NEXT: dx10-clamp: true
@@ -158,7 +158,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
-; CHECK-NEXT: maxMemoryClusterDWords: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
@@ -211,7 +211,7 @@ define void @function() {
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
-; CHECK-NEXT: maxMemoryClusterDWords: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
More information about the llvm-commits
mailing list