[llvm] [AMDGPU] Make max dwords of memory cluster configurable (PR #119342)

via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 16 18:45:13 PST 2024


https://github.com/ruiling updated https://github.com/llvm/llvm-project/pull/119342

>From 7a8af305e1bc2fc60e30d8486c94215d1d1206ca Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Wed, 27 Nov 2024 16:10:20 +0800
Subject: [PATCH 1/2] [AMDGPU] Make max dwords of memory cluster configurable

We find it helpful to increase the value for graphics workload.
Make it configurable so we can experiment with a different value.
It might be more helpful we can have a per-function value, but I am not
sure how this can be done properly.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4a94d690297949..ed651adcc8cd2e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -60,6 +60,11 @@ static cl::opt<bool> Fix16BitCopies(
   cl::init(true),
   cl::ReallyHidden);
 
+static cl::opt<unsigned> MaxMemoryClusterDWORDS(
+    "amdgpu-max-memory-cluster-dwords", cl::Hidden, cl::init(8),
+    cl::desc(
+        "Restrict the maximum dwords for memory cluster during scheduler"));
+
 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
   : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
     RI(ST), ST(ST) {
@@ -565,12 +570,14 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
   }
 
   // In order to avoid register pressure, on an average, the number of DWORDS
-  // loaded together by all clustered mem ops should not exceed 8. This is an
-  // empirical value based on certain observations and performance related
-  // experiments.
+  // loaded together by all clustered mem ops should not exceed
+  // MaxMemoryClusterDWORDS. This is an empirical value based on certain
+  // observations and performance related experiments.
   // The good thing about this heuristic is - it avoids clustering of too many
   // sub-word loads, and also avoids clustering of wide loads. Below is the
-  // brief summary of how the heuristic behaves for various `LoadSize`.
+  // brief summary of how the heuristic behaves for various `LoadSize` when
+  // MaxMemoryClusterDWORDS is 8.
+  //
   // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
   // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
   // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
@@ -578,7 +585,7 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
   // (5) LoadSize >= 17: do not cluster
   const unsigned LoadSize = NumBytes / ClusterSize;
   const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
-  return NumDWORDs <= 8;
+  return NumDWORDs <= MaxMemoryClusterDWORDS;
 }
 
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,

>From e749e58c52b36846d49a31d68249e145fe883b31 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Tue, 17 Dec 2024 10:35:39 +0800
Subject: [PATCH 2/2] Add new function attribute
 amdgpu-max-memory-cluster-dwords

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  19 ++--
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  14 ++-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |   9 ++
 .../AMDGPU/group-image-instructions.ll        | 104 +++++++++---------
 .../AMDGPU/long-branch-reg-all-sgpr-used.ll   |   2 +
 .../AMDGPU/machine-function-info-after-pei.ll |   1 +
 ...ine-function-info-long-branch-reg-debug.ll |   1 +
 .../machine-function-info-long-branch-reg.ll  |   1 +
 .../AMDGPU/machine-function-info-no-ir.mir    |   4 +
 .../MIR/AMDGPU/machine-function-info.ll       |   4 +
 10 files changed, 97 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ed651adcc8cd2e..f8f604222f3a6f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -60,11 +60,6 @@ static cl::opt<bool> Fix16BitCopies(
   cl::init(true),
   cl::ReallyHidden);
 
-static cl::opt<unsigned> MaxMemoryClusterDWORDS(
-    "amdgpu-max-memory-cluster-dwords", cl::Hidden, cl::init(8),
-    cl::desc(
-        "Restrict the maximum dwords for memory cluster during scheduler"));
-
 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
   : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
     RI(ST), ST(ST) {
@@ -559,11 +554,17 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                       unsigned NumBytes) const {
   // If the mem ops (to be clustered) do not have the same base ptr, then they
   // should not be clustered
+  unsigned MaxMemoryClusterDWords = 8;
   if (!BaseOps1.empty() && !BaseOps2.empty()) {
     const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
     const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
     if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
       return false;
+
+    const SIMachineFunctionInfo *MFI =
+        FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
+    if (MFI->getMaxMemoryClusterDWords())
+      MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
   } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
     // If only one base op is empty, they do not have the same base ptr
     return false;
@@ -571,12 +572,12 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
 
   // In order to avoid register pressure, on an average, the number of DWORDS
   // loaded together by all clustered mem ops should not exceed
-  // MaxMemoryClusterDWORDS. This is an empirical value based on certain
+  // MaxMemoryClusterDWords. This is an empirical value based on certain
   // observations and performance related experiments.
   // The good thing about this heuristic is - it avoids clustering of too many
   // sub-word loads, and also avoids clustering of wide loads. Below is the
   // brief summary of how the heuristic behaves for various `LoadSize` when
-  // MaxMemoryClusterDWORDS is 8.
+  // MaxMemoryClusterDWords is 8.
   //
   // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
   // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
@@ -584,8 +585,8 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
   // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
   // (5) LoadSize >= 17: do not cluster
   const unsigned LoadSize = NumBytes / ClusterSize;
-  const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
-  return NumDWORDs <= MaxMemoryClusterDWORDS;
+  const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
+  return NumDWords <= MaxMemoryClusterDWords;
 }
 
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 1e43d2727a00da..d331b89b4f2e63 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -163,6 +163,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   if (!S.empty())
     S.consumeInteger(0, HighBitsOf32BitAddress);
 
+  A = F.getFnAttribute("amdgpu-max-memory-cluster-dwords");
+  S = A.getValueAsString();
+  if (!S.empty())
+    S.consumeInteger(0, MaxMemoryClusterDWords);
+
   // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
   // VGPR available at all times. For now, reserve highest available VGPR. After
   // RA, shift it to the lowest available unused VGPR if the one exist.
@@ -694,8 +699,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
     const llvm::MachineFunction &MF)
     : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
       MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
-      GDSSize(MFI.getGDSSize()),
-      DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
+      GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
+      IsEntryFunction(MFI.isEntryFunction()),
       NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
       MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
       HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
@@ -708,8 +713,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
       BytesInStackArgArea(MFI.getBytesInStackArgArea()),
       ReturnsVoid(MFI.returnsVoid()),
       ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
-      PSInputAddr(MFI.getPSInputAddr()),
-      PSInputEnable(MFI.getPSInputEnable()),
+      PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
+      MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
       Mode(MFI.getMode()) {
   for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
     SpillPhysVGPRS.push_back(regToString(Reg, TRI));
@@ -744,6 +749,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
   DynLDSAlign = YamlMFI.DynLDSAlign;
   PSInputAddr = YamlMFI.PSInputAddr;
   PSInputEnable = YamlMFI.PSInputEnable;
+  MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
   HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
   Occupancy = YamlMFI.Occupancy;
   IsEntryFunction = YamlMFI.IsEntryFunction;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 2a754680fdc8ca..340c7ee0d957af 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -289,6 +289,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
 
   unsigned PSInputAddr = 0;
   unsigned PSInputEnable = 0;
+  unsigned MaxMemoryClusterDWords = 0;
 
   SIMode Mode;
   std::optional<FrameIndex> ScavengeFI;
@@ -333,6 +334,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
     YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
     YamlIO.mapOptional("psInputAddr", MFI.PSInputAddr, 0u);
     YamlIO.mapOptional("psInputEnable", MFI.PSInputEnable, 0u);
+    YamlIO.mapOptional("maxMemoryClusterDWords", MFI.MaxMemoryClusterDWords,
+                       0u);
     YamlIO.mapOptional("mode", MFI.Mode, SIMode());
     YamlIO.mapOptional("highBitsOf32BitAddress",
                        MFI.HighBitsOf32BitAddress, 0u);
@@ -487,6 +490,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   // Current recorded maximum possible occupancy.
   unsigned Occupancy;
 
+  // Maximum number of dwords that can be clusterred during instruction
+  // scheduler stage.
+  unsigned MaxMemoryClusterDWords = 0;
+
   mutable std::optional<bool> UsesAGPRs;
 
   MCPhysReg getNextUserSGPR() const;
@@ -1109,6 +1116,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     limitOccupancy(MF);
   }
 
+  unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; }
+
   bool mayNeedAGPRs() const {
     return MayNeedAGPRs;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
index 7f587ac0b87161..80d4fa69be4255 100644
--- a/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
+++ b/llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
@@ -4,7 +4,7 @@
 define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
 ; GFX11-LABEL: group_image_sample:
 ; GFX11:       ; %bb.0: ; %.entry
-; GFX11-NEXT:    s_mov_b32 s24, exec_lo
+; GFX11-NEXT:    s_mov_b32 s33, exec_lo
 ; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    s_mov_b32 m0, s4
 ; GFX11-NEXT:    s_getpc_b64 s[4:5]
@@ -21,73 +21,79 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
 ; GFX11-NEXT:    lds_param_load v2, attr0.y wait_vdst:15
 ; GFX11-NEXT:    lds_param_load v3, attr0.x wait_vdst:15
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s16
+; GFX11-NEXT:    v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
+; GFX11-NEXT:    v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    s_clause 0xf
 ; GFX11-NEXT:    s_buffer_load_b64 s[16:17], s[12:15], 0x10
 ; GFX11-NEXT:    s_buffer_load_b64 s[18:19], s[12:15], 0x20
 ; GFX11-NEXT:    s_buffer_load_b64 s[20:21], s[12:15], 0x30
 ; GFX11-NEXT:    s_buffer_load_b64 s[22:23], s[12:15], 0x40
-; GFX11-NEXT:    v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
-; GFX11-NEXT:    v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_buffer_load_b64 s[24:25], s[12:15], 0x50
+; GFX11-NEXT:    s_buffer_load_b64 s[26:27], s[12:15], 0x60
+; GFX11-NEXT:    s_buffer_load_b64 s[28:29], s[12:15], 0x70
+; GFX11-NEXT:    s_buffer_load_b64 s[30:31], s[12:15], 0x80
+; GFX11-NEXT:    s_buffer_load_b64 s[34:35], s[12:15], 0x90
+; GFX11-NEXT:    s_buffer_load_b64 s[36:37], s[12:15], 0xa0
+; GFX11-NEXT:    s_buffer_load_b64 s[38:39], s[12:15], 0xb0
+; GFX11-NEXT:    s_buffer_load_b64 s[40:41], s[12:15], 0xc0
+; GFX11-NEXT:    s_buffer_load_b64 s[42:43], s[12:15], 0xd0
+; GFX11-NEXT:    s_buffer_load_b64 s[44:45], s[12:15], 0xe0
+; GFX11-NEXT:    s_buffer_load_b64 s[46:47], s[12:15], 0xf0
+; GFX11-NEXT:    s_buffer_load_b64 s[12:13], s[12:15], 0x100
+; GFX11-NEXT:    v_interp_p2_f32 v36, v2, v1, v4 wait_exp:7
 ; GFX11-NEXT:    v_interp_p2_f32 v0, v3, v1, v0 wait_exp:7
-; GFX11-NEXT:    v_interp_p2_f32 v1, v2, v1, v4 wait_exp:7
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v4, s16, v0 :: v_dual_add_f32 v5, s17, v1
-; GFX11-NEXT:    v_dual_add_f32 v12, s20, v0 :: v_dual_add_f32 v13, s21, v1
-; GFX11-NEXT:    v_dual_add_f32 v8, s18, v0 :: v_dual_add_f32 v9, s19, v1
-; GFX11-NEXT:    v_dual_add_f32 v16, s22, v0 :: v_dual_add_f32 v17, s23, v1
-; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v5, s17, v36
+; GFX11-NEXT:    v_add_f32_e32 v4, s16, v0
+; GFX11-NEXT:    v_add_f32_e32 v8, s18, v0
+; GFX11-NEXT:    v_add_f32_e32 v9, s19, v36
+; GFX11-NEXT:    v_add_f32_e32 v12, s20, v0
+; GFX11-NEXT:    v_add_f32_e32 v13, s21, v36
+; GFX11-NEXT:    v_add_f32_e32 v16, s22, v0
+; GFX11-NEXT:    v_add_f32_e32 v17, s23, v36
+; GFX11-NEXT:    v_add_f32_e32 v20, s24, v0
+; GFX11-NEXT:    v_add_f32_e32 v21, s25, v36
+; GFX11-NEXT:    v_add_f32_e32 v24, s26, v0
+; GFX11-NEXT:    v_add_f32_e32 v25, s27, v36
+; GFX11-NEXT:    v_add_f32_e32 v28, s28, v0
+; GFX11-NEXT:    v_add_f32_e32 v29, s29, v36
+; GFX11-NEXT:    v_add_f32_e32 v32, s30, v0
+; GFX11-NEXT:    v_add_f32_e32 v33, s31, v36
+; GFX11-NEXT:    s_clause 0x7
 ; GFX11-NEXT:    image_sample v[4:7], v[4:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    image_sample v[12:15], v[12:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_buffer_load_b64 s[16:17], s[12:15], 0x50
-; GFX11-NEXT:    s_buffer_load_b64 s[18:19], s[12:15], 0x60
-; GFX11-NEXT:    s_buffer_load_b64 s[20:21], s[12:15], 0x70
-; GFX11-NEXT:    s_buffer_load_b64 s[22:23], s[12:15], 0x80
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_add_f32 v20, s16, v0 :: v_dual_add_f32 v21, s17, v1
-; GFX11-NEXT:    v_dual_add_f32 v28, s20, v0 :: v_dual_add_f32 v29, s21, v1
-; GFX11-NEXT:    v_dual_add_f32 v24, s18, v0 :: v_dual_add_f32 v25, s19, v1
-; GFX11-NEXT:    v_dual_add_f32 v32, s22, v0 :: v_dual_add_f32 v33, s23, v1
-; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    image_sample v[28:31], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_buffer_load_b64 s[16:17], s[12:15], 0x90
-; GFX11-NEXT:    s_buffer_load_b64 s[18:19], s[12:15], 0xa0
-; GFX11-NEXT:    s_buffer_load_b64 s[20:21], s[12:15], 0xb0
-; GFX11-NEXT:    s_buffer_load_b64 s[22:23], s[12:15], 0xc0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_add_f32 v36, s16, v0 :: v_dual_add_f32 v37, s17, v1
-; GFX11-NEXT:    v_dual_add_f32 v44, s20, v0 :: v_dual_add_f32 v45, s21, v1
-; GFX11-NEXT:    v_dual_add_f32 v40, s18, v0 :: v_dual_add_f32 v41, s19, v1
-; GFX11-NEXT:    v_dual_add_f32 v48, s22, v0 :: v_dual_add_f32 v49, s23, v1
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    image_sample v[36:39], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    v_add_f32_e32 v37, s34, v0
+; GFX11-NEXT:    v_add_f32_e32 v38, s35, v36
+; GFX11-NEXT:    v_add_f32_e32 v40, s36, v0
+; GFX11-NEXT:    v_add_f32_e32 v41, s37, v36
+; GFX11-NEXT:    v_add_f32_e32 v44, s38, v0
+; GFX11-NEXT:    v_add_f32_e32 v45, s39, v36
+; GFX11-NEXT:    v_add_f32_e32 v48, s40, v0
+; GFX11-NEXT:    v_add_f32_e32 v49, s41, v36
+; GFX11-NEXT:    v_add_f32_e32 v52, s42, v0
+; GFX11-NEXT:    v_add_f32_e32 v53, s43, v36
+; GFX11-NEXT:    v_add_f32_e32 v56, s44, v0
+; GFX11-NEXT:    v_add_f32_e32 v57, s45, v36
+; GFX11-NEXT:    v_add_f32_e32 v60, s46, v0
+; GFX11-NEXT:    v_add_f32_e32 v61, s47, v36
+; GFX11-NEXT:    v_add_f32_e32 v0, s12, v0
+; GFX11-NEXT:    v_add_f32_e32 v1, s13, v36
+; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s33
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    image_sample v[36:39], v[37:38], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    image_sample v[44:47], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    s_buffer_load_b64 s[16:17], s[12:15], 0xd0
-; GFX11-NEXT:    s_buffer_load_b64 s[18:19], s[12:15], 0xe0
-; GFX11-NEXT:    s_buffer_load_b64 s[20:21], s[12:15], 0xf0
-; GFX11-NEXT:    s_buffer_load_b64 s[12:13], s[12:15], 0x100
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_add_f32 v52, s16, v0 :: v_dual_add_f32 v53, s17, v1
-; GFX11-NEXT:    v_dual_add_f32 v56, s18, v0 :: v_dual_add_f32 v57, s19, v1
-; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    image_sample v[52:55], v[52:53], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT:    v_dual_add_f32 v60, s20, v0 :: v_dual_add_f32 v61, s21, v1
-; GFX11-NEXT:    v_dual_add_f32 v0, s12, v0 :: v_dual_add_f32 v1, s13, v1
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s24
-; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    image_sample v[60:63], v[60:61], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    image_sample v[64:67], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    s_waitcnt vmcnt(14)
@@ -446,7 +452,7 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3
 declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3
 declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8
 
-attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause"}
+attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause" "amdgpu-max-memory-cluster-dwords"="32"}
 attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
 attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) }
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index 0f7a5f8e0941ad..35f5ac30e0593e 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -29,6 +29,7 @@
 ; CHECK-NEXT:     workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT:   psInputAddr:     0
 ; CHECK-NEXT:   psInputEnable:   0
+; CHECK-NEXT:   maxMemoryClusterDWords: 0
 ; CHECK-NEXT:   mode:
 ; CHECK-NEXT:     ieee:            true
 ; CHECK-NEXT:     dx10-clamp:      true
@@ -295,6 +296,7 @@
 ; CHECK-NEXT:     workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT:   psInputAddr:     0
 ; CHECK-NEXT:   psInputEnable:   0
+; CHECK-NEXT:   maxMemoryClusterDWords: 0
 ; CHECK-NEXT:   mode:
 ; CHECK-NEXT:     ieee:            true
 ; CHECK-NEXT:     dx10-clamp:      true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index 7759501ea42268..2782cb35ba4034 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -29,6 +29,7 @@
 ; AFTER-PEI-NEXT:   workItemIDX:     { reg: '$vgpr0' }
 ; AFTER-PEI-NEXT: psInputAddr:     0
 ; AFTER-PEI-NEXT: psInputEnable:   0
+; AFTER-PEI-NEXT: maxMemoryClusterDWords:   0
 ; AFTER-PEI-NEXT: mode:
 ; AFTER-PEI-NEXT:   ieee:            true
 ; AFTER-PEI-NEXT:   dx10-clamp:      true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 4545c8bbeb3e6c..4e2efe9c9d8b1a 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -30,6 +30,7 @@
 ; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT: psInputAddr:     0
 ; CHECK-NEXT: psInputEnable:   0
+; CHECK-NEXT: maxMemoryClusterDWords: 0
 ; CHECK-NEXT: mode:
 ; CHECK-NEXT: ieee:            true
 ; CHECK-NEXT: dx10-clamp:      true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 8215ba834170f2..7559067047ac8b 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -30,6 +30,7 @@
 ; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT: psInputAddr:     0
 ; CHECK-NEXT: psInputEnable:   0
+; CHECK-NEXT: maxMemoryClusterDWords:   0
 ; CHECK-NEXT: mode:
 ; CHECK-NEXT: ieee:            true
 ; CHECK-NEXT: dx10-clamp:      true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 51795a4fea515e..e5ffe7a2a95067 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -39,6 +39,7 @@
 # FULL-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
 # FULL-NEXT: psInputAddr:     0
 # FULL-NEXT: psInputEnable:   0
+# FULL-NEXT: maxMemoryClusterDWords:   0
 # FULL-NEXT: mode:
 # FULL-NEXT: ieee: true
 # FULL-NEXT: dx10-clamp: true
@@ -143,6 +144,7 @@ body:             |
 # FULL-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
 # FULL-NEXT: psInputAddr:     0
 # FULL-NEXT: psInputEnable:   0
+# FULL-NEXT: maxMemoryClusterDWords:   0
 # FULL-NEXT: mode:
 # FULL-NEXT: ieee: true
 # FULL-NEXT: dx10-clamp: true
@@ -218,6 +220,7 @@ body:             |
 # FULL-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
 # FULL-NEXT: psInputAddr:     0
 # FULL-NEXT: psInputEnable:   0
+# FULL-NEXT: maxMemoryClusterDWords:   0
 # FULL-NEXT: mode:
 # FULL-NEXT: ieee: true
 # FULL-NEXT: dx10-clamp: true
@@ -294,6 +297,7 @@ body:             |
 # FULL-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
 # FULL-NEXT: psInputAddr:     0
 # FULL-NEXT: psInputEnable:   0
+# FULL-NEXT: maxMemoryClusterDWords:   0
 # FULL-NEXT: mode:
 # FULL-NEXT: ieee: true
 # FULL-NEXT: dx10-clamp: true
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index 077d22fc895ae5..7c29827beb8902 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -40,6 +40,7 @@
 ; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' }
 ; CHECK-NEXT: psInputAddr: 0
 ; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 0
 ; CHECK-NEXT: mode:
 ; CHECK-NEXT: ieee: true
 ; CHECK-NEXT: dx10-clamp: true
@@ -86,6 +87,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
 ; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' }
 ; CHECK-NEXT: psInputAddr: 1
 ; CHECK-NEXT: psInputEnable: 1
+; CHECK-NEXT: maxMemoryClusterDWords: 0
 ; CHECK-NEXT: mode:
 ; CHECK-NEXT: ieee: false
 ; CHECK-NEXT: dx10-clamp: true
@@ -156,6 +158,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
 ; CHECK-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
 ; CHECK-NEXT: psInputAddr: 0
 ; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 0
 ; CHECK-NEXT: mode:
 ; CHECK-NEXT: ieee: true
 ; CHECK-NEXT: dx10-clamp: true
@@ -208,6 +211,7 @@ define void @function() {
 ; CHECK-NEXT: workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
 ; CHECK-NEXT: psInputAddr: 0
 ; CHECK-NEXT: psInputEnable: 0
+; CHECK-NEXT: maxMemoryClusterDWords: 0
 ; CHECK-NEXT: mode:
 ; CHECK-NEXT: ieee: true
 ; CHECK-NEXT: dx10-clamp: true



More information about the llvm-commits mailing list