[llvm] [AMDGPU] Make maximum hard clause size a subtarget feature (PR #81287)
Krzysztof Drewniak via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 9 11:58:02 PST 2024
https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/81287
>From e07fb958b8abefcca2d5929093b0863667038ad3 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 9 Feb 2024 16:04:00 +0000
Subject: [PATCH 1/2] [AMDGPU] Make maximum hard clause size a subtarget
feature
gfx11 chips may, in some conditions, behave incorrectly with S_CLAUSE
instructions (hard clauses) containing more than 32 operations (that
is, whose arguments exceed 0x1f). However, gfx10 targets will work
successfully with clauses of up to length 63.
Therefore, define the MaxHardClauseLength property on GCNSubtarget and
make it a subtarget feature via tablegen, thus allowing us to specify,
both now and in the future, the maximum viable size of clauses on
various hardware from the tablegen definition. If MaxHardClauseLength
is 0, which is the default, the hardware does not support hard
clauses.
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 25 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 11 +-
.../lib/Target/AMDGPU/SIInsertHardClauses.cpp | 9 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 2 +-
llvm/test/CodeGen/AMDGPU/function-args.ll | 2 +-
llvm/test/CodeGen/AMDGPU/hard-clauses.mir | 28 +-
.../CodeGen/AMDGPU/max-hard-clause-length.ll | 1418 +++++++++++++++++
llvm/test/CodeGen/AMDGPU/select.f16.ll | 2 +-
8 files changed, 1478 insertions(+), 19 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 55dbc1a803e13c..09136f8167bf10 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -227,6 +227,22 @@ def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard
"Switching between LDS and VMEM-tex not waiting VM_VSRC=0"
>;
+class FeatureMaxHardClauseLength<int size> : SubtargetFeature<
+ "max-hard-clause-length-"#size,
+ "MaxHardClauseLength",
+ !cast<string>(size),
+ "Maximum number of instructions in an explicit S_CLAUSE is "#size
+>;
+
+def FeatureMaxHardClauseLength32 : FeatureMaxHardClauseLength<32>;
+def FeatureMaxHardClauseLength63 : FeatureMaxHardClauseLength<63>;
+def FeatureNoHardClauses : SubtargetFeature<
+ "no-hard-clauses",
+ "MaxHardClauseLength",
+ "0",
+ "S_CLAUSE instructions may not be used"
+>;
+
def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug",
"HasNSAtoVMEMBug",
"true",
@@ -1086,7 +1102,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
- FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
+ FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureMaxHardClauseLength63
]
>;
@@ -1106,7 +1123,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
- FeatureGWS, FeatureDefaultComponentZero
+ FeatureGWS, FeatureDefaultComponentZero,
+ FeatureMaxHardClauseLength32
]
>;
@@ -1126,7 +1144,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
- FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast
+ FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
+ FeatureMaxHardClauseLength32
]
>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4f8eeaaf500b4d..7d3f64e491cd0b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -168,6 +168,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasFlatAtomicFaddF32Inst = false;
bool HasDefaultComponentZero = false;
bool HasDefaultComponentBroadcast = false;
+ /// The maximum number of instructions that may be placed within an S_CLAUSE,
+ /// which is one greater than the maximum argument to S_CLAUSE. A value
+ /// of 0 indicates a lack of S_CLAUSE support.
+ unsigned MaxHardClauseLength = 0;
bool SupportsSRAMECC = false;
// This should not be used directly. 'TargetID' tracks the dynamic settings
@@ -1143,7 +1147,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasNSAClauseBug() const { return HasNSAClauseBug; }
- bool hasHardClauses() const { return getGeneration() >= GFX10; }
+ bool hasHardClauses() const { return MaxHardClauseLength > 0; }
bool hasGFX90AInsts() const { return GFX90AInsts; }
@@ -1208,6 +1212,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
+ /// \returns The maximum number of instructions that can be enclosed in an
+ /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
+ /// instruction.
+ unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 442ae4dd7b34fe..46dee9d6d04e8b 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -43,11 +43,6 @@ using namespace llvm;
namespace {
-// A clause length of 64 instructions could be encoded in the s_clause
-// instruction, but the hardware documentation (at least for GFX11) says that
-// 63 is the maximum allowed.
-constexpr unsigned MaxInstructionsInClause = 63;
-
enum HardClauseType {
// For GFX10:
@@ -182,7 +177,7 @@ class SIInsertHardClauses : public MachineFunctionPass {
bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
if (CI.First == CI.Last)
return false;
- assert(CI.Length <= MaxInstructionsInClause && "Hard clause is too long!");
+ assert(CI.Length <= ST->maxHardClauseLength() && "Hard clause is too long!");
auto &MBB = *CI.First->getParent();
auto ClauseMI =
@@ -223,7 +218,7 @@ class SIInsertHardClauses : public MachineFunctionPass {
}
}
- if (CI.Length == MaxInstructionsInClause ||
+ if (CI.Length == ST->maxHardClauseLength() ||
(CI.Length && Type != HARDCLAUSE_INTERNAL &&
Type != HARDCLAUSE_IGNORE &&
(Type != CI.Type ||
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 3c66c83042951b..387c4a16a008ae 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -28974,7 +28974,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11-LABEL: v_vselect_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x20
+; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_load_u16 v31, off, s32
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:128
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 4963dc517574d7..38bfee961dd296 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -4037,7 +4037,7 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
; GFX11-LABEL: void_func_v32i32_v16i32_v16f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x20
+; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:76
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:72
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
index d019addb551a3a..1c6bdff51015ee 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
@@ -13,6 +13,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; CHECK-NEXT: S_NOP 2
+ ;
; GFX11-LABEL: name: nop1
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
@@ -37,6 +38,7 @@ body: |
; CHECK-NEXT: S_NOP 2
; CHECK-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; CHECK-NEXT: }
+ ;
; GFX11-LABEL: name: nop2
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
@@ -67,6 +69,7 @@ body: |
; CHECK-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; CHECK-NEXT: }
; CHECK-NEXT: S_NOP 2
+ ;
; GFX11-LABEL: name: nop3
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
@@ -178,11 +181,12 @@ body: |
; CHECK-NEXT: $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, implicit $exec
; CHECK-NEXT: $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, implicit $exec
; CHECK-NEXT: }
+ ;
; GFX11-LABEL: name: long_clause
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, 
implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit-def $vgpr33, implicit-def $vgpr33_lo16, implicit-def $vgpr33_hi16, implicit-def $vgpr34, implicit-def $vgpr34_lo16, implicit-def $vgpr34_hi16, implicit-def $vgpr35, implicit-def $vgpr35_lo16, implicit-def $vgpr35_hi16, implicit-def $vgpr36, implicit-def $vgpr36_lo16, implicit-def $vgpr36_hi16, implicit-def $vgpr37, implicit-def $vgpr37_lo16, implicit-def $vgpr37_hi16, implicit-def $vgpr38, implicit-def $vgpr38_lo16, implicit-def $vgpr38_hi16, implicit-def $vgpr39, implicit-def $vgpr39_lo16, implicit-def $vgpr39_hi16, implicit-def $vgpr40, implicit-def $vgpr40_lo16, implicit-def $vgpr40_hi16, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr53, implicit-def 
$vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
- ; GFX11-NEXT: S_CLAUSE 62
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, 
implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: S_CLAUSE 31
; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec
@@ -215,6 +219,9 @@ body: |
; GFX11-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec
+ ; GFX11-NEXT: }
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr33, implicit-def $vgpr33_lo16, implicit-def $vgpr33_hi16, implicit-def $vgpr34, implicit-def $vgpr34_lo16, implicit-def $vgpr34_hi16, implicit-def $vgpr35, implicit-def $vgpr35_lo16, implicit-def $vgpr35_hi16, implicit-def $vgpr36, implicit-def $vgpr36_lo16, implicit-def $vgpr36_hi16, implicit-def $vgpr37, implicit-def $vgpr37_lo16, implicit-def $vgpr37_hi16, implicit-def $vgpr38, implicit-def $vgpr38_lo16, implicit-def $vgpr38_hi16, implicit-def $vgpr39, implicit-def $vgpr39_lo16, implicit-def $vgpr39_hi16, implicit-def $vgpr40, implicit-def $vgpr40_lo16, implicit-def $vgpr40_hi16, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, 
implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: S_CLAUSE 31
; GFX11-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, implicit $exec
@@ -246,10 +253,10 @@ body: |
; GFX11-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, implicit $exec
- ; GFX11-NEXT: }
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
- ; GFX11-NEXT: S_CLAUSE 16
; GFX11-NEXT: $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec
+ ; GFX11-NEXT: }
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: S_CLAUSE 15
; GFX11-NEXT: $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, implicit $exec
@@ -360,6 +367,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ;
; GFX11-LABEL: name: mimg_nsa
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -384,6 +392,7 @@ body: |
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ;
; GFX11-LABEL: name: mimg_nsa_mixed
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX11-NEXT: {{ $}}
@@ -413,6 +422,7 @@ body: |
; CHECK-NEXT: KILL undef renamable $sgpr4
; CHECK-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; CHECK-NEXT: }
+ ;
; GFX11-LABEL: name: kill
; GFX11: liveins: $sgpr0_sgpr1, $sgpr4
; GFX11-NEXT: {{ $}}
@@ -443,6 +453,7 @@ body: |
; CHECK-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; CHECK-NEXT: }
; CHECK-NEXT: KILL undef renamable $sgpr5
+ ;
; GFX11-LABEL: name: kill2
; GFX11: liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5
; GFX11-NEXT: {{ $}}
@@ -473,6 +484,7 @@ body: |
; CHECK-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; CHECK-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
; CHECK-NEXT: }
+ ;
; GFX11-LABEL: name: flat_load_atomic
; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
@@ -496,6 +508,7 @@ body: |
; CHECK-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; CHECK-NEXT: $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
; CHECK-NEXT: }
+ ;
; GFX11-LABEL: name: global_load_atomic
; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
@@ -516,6 +529,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; CHECK-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+ ;
; GFX11-LABEL: name: flat_global_load
; GFX11: liveins: $vgpr0_vgpr1
; GFX11-NEXT: {{ $}}
@@ -539,6 +553,7 @@ body: |
; CHECK-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
; CHECK-NEXT: $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
; CHECK-NEXT: }
+ ;
; GFX11-LABEL: name: buffer_load_atomic
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
; GFX11-NEXT: {{ $}}
@@ -559,6 +574,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; CHECK-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+ ;
; GFX11-LABEL: name: flat_load_store
; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
@@ -579,6 +595,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+ ;
; GFX11-LABEL: name: global_load_store
; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
@@ -599,6 +616,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
+ ;
; GFX11-LABEL: name: buffer_load_store
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
; GFX11-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
new file mode 100644
index 00000000000000..2b5d32fa7b9776
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
@@ -0,0 +1,1418 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefixes=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=GFX11
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
+
+;; Check that consecutive store operations are grouped greedily into
+;; hard clauses of the appropriate length for each target.
+;; This test uses <4 x i32> stores in order to prevent the stores from
+;; being combined into larger operations due to their adjacency.
+define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
+; GFX10-LABEL: long_store_chain:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_mov_b32 s1, s0
+; GFX10-NEXT: s_mov_b32 s2, s0
+; GFX10-NEXT: s_mov_b32 s3, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:16
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:32
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:48
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:64
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:80
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:96
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:112
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:128
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:144
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:160
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:176
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:192
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:208
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:224
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:240
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:256
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:272
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:288
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:304
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:320
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:336
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:352
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:368
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:384
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:400
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:416
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:432
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:448
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:464
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:480
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:496
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:512
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:528
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:544
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:560
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:576
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:592
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:608
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:624
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:640
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:656
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:672
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:688
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:704
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:720
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:736
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:752
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:768
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:784
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:800
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:816
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:832
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:848
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:864
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:880
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:896
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:912
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:928
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:944
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:960
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:976
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:992
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:1008
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:1024
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] offset:1040
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: long_store_chain:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s1, s0
+; GFX11-NEXT: s_mov_b32 s2, s0
+; GFX11-NEXT: s_mov_b32 s3, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:16
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:32
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:48
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:64
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:80
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:96
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:112
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:128
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:144
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:160
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:176
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:192
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:208
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:224
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:240
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:256
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:272
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:288
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:304
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:320
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:336
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:352
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:368
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:384
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:400
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:416
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:432
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:448
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:464
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:480
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:496
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:512
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:528
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:544
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:560
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:576
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:592
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:608
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:624
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:640
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:656
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:672
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:688
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:704
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:720
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:736
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:752
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:768
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:784
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:800
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:816
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:832
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:848
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:864
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:880
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:896
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:912
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:928
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:944
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:960
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:976
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:992
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:1008
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:1024
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:1040
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: long_store_chain:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_clause 0x1f
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:128
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:144
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:160
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:192
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:256
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:272
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:288
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:304
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:320
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:336
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:352
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:368
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:384
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:400
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:416
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:432
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:448
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:464
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:480
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:496
+; GFX12-NEXT: s_clause 0x1f
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:512
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:528
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:544
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:560
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:576
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:592
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:608
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:624
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:640
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:656
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:672
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:688
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:704
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:720
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:736
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:752
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:768
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:784
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:800
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:816
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:832
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:848
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:864
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:880
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:896
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:912
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:928
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:944
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:960
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:976
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:992
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:1008
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:1024
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5] offset:1040
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %p
+ %ptr1 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 1
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr1
+ %ptr2 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 2
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr2
+ %ptr3 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 3
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr3
+ %ptr4 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 4
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr4
+ %ptr5 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 5
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr5
+ %ptr6 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 6
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr6
+ %ptr7 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 7
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr7
+ %ptr8 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 8
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr8
+ %ptr9 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 9
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr9
+ %ptr10 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 10
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr10
+ %ptr11 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 11
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr11
+ %ptr12 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 12
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr12
+ %ptr13 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 13
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr13
+ %ptr14 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 14
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr14
+ %ptr15 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 15
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr15
+ %ptr16 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 16
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr16
+ %ptr17 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 17
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr17
+ %ptr18 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 18
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr18
+ %ptr19 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 19
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr19
+ %ptr20 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 20
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr20
+ %ptr21 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 21
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr21
+ %ptr22 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 22
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr22
+ %ptr23 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 23
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr23
+ %ptr24 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 24
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr24
+ %ptr25 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 25
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr25
+ %ptr26 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 26
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr26
+ %ptr27 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 27
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr27
+ %ptr28 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 28
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr28
+ %ptr29 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 29
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr29
+ %ptr30 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 30
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr30
+ %ptr31 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 31
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr31
+ %ptr32 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 32
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr32
+ %ptr33 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 33
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr33
+ %ptr34 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 34
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr34
+ %ptr35 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 35
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr35
+ %ptr36 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 36
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr36
+ %ptr37 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 37
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr37
+ %ptr38 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 38
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr38
+ %ptr39 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 39
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr39
+ %ptr40 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 40
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr40
+ %ptr41 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 41
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr41
+ %ptr42 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 42
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr42
+ %ptr43 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 43
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr43
+ %ptr44 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 44
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr44
+ %ptr45 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 45
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr45
+ %ptr46 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 46
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr46
+ %ptr47 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 47
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr47
+ %ptr48 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 48
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr48
+ %ptr49 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 49
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr49
+ %ptr50 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 50
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr50
+ %ptr51 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 51
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr51
+ %ptr52 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 52
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr52
+ %ptr53 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 53
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr53
+ %ptr54 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 54
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr54
+ %ptr55 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 55
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr55
+ %ptr56 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 56
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr56
+ %ptr57 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 57
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr57
+ %ptr58 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 58
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr58
+ %ptr59 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 59
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr59
+ %ptr60 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 60
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr60
+ %ptr61 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 61
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr61
+ %ptr62 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 62
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr62
+ %ptr63 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 63
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr63
+ %ptr64 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 64
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr64
+ %ptr65 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 65
+ store <4 x i32> zeroinitializer, ptr addrspace(1) %ptr65
+ ret void
+}
+
+;; Long chain of loads: gfx10 doesn't cluster stores, so the store test above
+;; forms no hard clauses there. Use i32 loads to save on register pressure.
+define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
+; GFX10-LABEL: long_load_chain:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x3e
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT: s_load_dword s3, s[0:1], 0x10
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x20
+; GFX10-NEXT: s_load_dword s5, s[0:1], 0x30
+; GFX10-NEXT: s_load_dword s6, s[0:1], 0x40
+; GFX10-NEXT: s_load_dword s7, s[0:1], 0x50
+; GFX10-NEXT: s_load_dword s8, s[0:1], 0x60
+; GFX10-NEXT: s_load_dword s9, s[0:1], 0x70
+; GFX10-NEXT: s_load_dword s10, s[0:1], 0x80
+; GFX10-NEXT: s_load_dword s11, s[0:1], 0x90
+; GFX10-NEXT: s_load_dword s12, s[0:1], 0xa0
+; GFX10-NEXT: s_load_dword s13, s[0:1], 0xb0
+; GFX10-NEXT: s_load_dword s14, s[0:1], 0xc0
+; GFX10-NEXT: s_load_dword s15, s[0:1], 0xd0
+; GFX10-NEXT: s_load_dword s16, s[0:1], 0xe0
+; GFX10-NEXT: s_load_dword s17, s[0:1], 0xf0
+; GFX10-NEXT: s_load_dword s18, s[0:1], 0x100
+; GFX10-NEXT: s_load_dword s19, s[0:1], 0x110
+; GFX10-NEXT: s_load_dword s20, s[0:1], 0x120
+; GFX10-NEXT: s_load_dword s21, s[0:1], 0x130
+; GFX10-NEXT: s_load_dword s22, s[0:1], 0x140
+; GFX10-NEXT: s_load_dword s23, s[0:1], 0x150
+; GFX10-NEXT: s_load_dword s24, s[0:1], 0x160
+; GFX10-NEXT: s_load_dword s25, s[0:1], 0x170
+; GFX10-NEXT: s_load_dword s26, s[0:1], 0x180
+; GFX10-NEXT: s_load_dword s27, s[0:1], 0x190
+; GFX10-NEXT: s_load_dword s28, s[0:1], 0x1a0
+; GFX10-NEXT: s_load_dword s29, s[0:1], 0x1b0
+; GFX10-NEXT: s_load_dword s30, s[0:1], 0x1c0
+; GFX10-NEXT: s_load_dword s31, s[0:1], 0x1d0
+; GFX10-NEXT: s_load_dword s33, s[0:1], 0x1e0
+; GFX10-NEXT: s_load_dword s34, s[0:1], 0x1f0
+; GFX10-NEXT: s_load_dword s35, s[0:1], 0x200
+; GFX10-NEXT: s_load_dword s36, s[0:1], 0x210
+; GFX10-NEXT: s_load_dword s37, s[0:1], 0x220
+; GFX10-NEXT: s_load_dword s38, s[0:1], 0x230
+; GFX10-NEXT: s_load_dword s39, s[0:1], 0x240
+; GFX10-NEXT: s_load_dword s40, s[0:1], 0x250
+; GFX10-NEXT: s_load_dword s41, s[0:1], 0x260
+; GFX10-NEXT: s_load_dword s42, s[0:1], 0x270
+; GFX10-NEXT: s_load_dword s43, s[0:1], 0x280
+; GFX10-NEXT: s_load_dword s44, s[0:1], 0x290
+; GFX10-NEXT: s_load_dword s45, s[0:1], 0x2a0
+; GFX10-NEXT: s_load_dword s46, s[0:1], 0x2b0
+; GFX10-NEXT: s_load_dword s47, s[0:1], 0x2c0
+; GFX10-NEXT: s_load_dword s48, s[0:1], 0x2d0
+; GFX10-NEXT: s_load_dword s49, s[0:1], 0x2e0
+; GFX10-NEXT: s_load_dword s50, s[0:1], 0x2f0
+; GFX10-NEXT: s_load_dword s51, s[0:1], 0x300
+; GFX10-NEXT: s_load_dword s52, s[0:1], 0x310
+; GFX10-NEXT: s_load_dword s53, s[0:1], 0x320
+; GFX10-NEXT: s_load_dword s54, s[0:1], 0x330
+; GFX10-NEXT: s_load_dword s55, s[0:1], 0x340
+; GFX10-NEXT: s_load_dword s56, s[0:1], 0x350
+; GFX10-NEXT: s_load_dword s57, s[0:1], 0x360
+; GFX10-NEXT: s_load_dword s58, s[0:1], 0x370
+; GFX10-NEXT: s_load_dword s59, s[0:1], 0x380
+; GFX10-NEXT: s_load_dword s60, s[0:1], 0x390
+; GFX10-NEXT: s_load_dword s61, s[0:1], 0x3a0
+; GFX10-NEXT: s_load_dword s62, s[0:1], 0x3b0
+; GFX10-NEXT: s_load_dword s63, s[0:1], 0x3c0
+; GFX10-NEXT: s_load_dword s64, s[0:1], 0x3d0
+; GFX10-NEXT: s_load_dword s65, s[0:1], 0x3e0
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dword s66, s[0:1], 0x3f0
+; GFX10-NEXT: s_load_dword s67, s[0:1], 0x400
+; GFX10-NEXT: s_load_dword s0, s[0:1], 0x410
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s2
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s3
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s4
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s5
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s6
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s7
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s8
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s9
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s10
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s11
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s12
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s13
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s14
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s15
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s16
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s17
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s18
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s19
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s20
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s21
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s22
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s23
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s24
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s25
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s26
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s27
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s28
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s29
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s30
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s31
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s33
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s34
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s35
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s36
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s37
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s38
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s39
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s40
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s41
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s42
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s43
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s44
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s45
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s46
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s47
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s48
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s49
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s50
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s51
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s52
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s53
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s54
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s55
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s56
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s57
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s58
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s59
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s60
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s61
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s62
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s63
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s64
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s65
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s66
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s67
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ; use s0
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: long_load_chain:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x10
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x20
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x30
+; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x40
+; GFX11-NEXT: s_load_b32 s7, s[0:1], 0x50
+; GFX11-NEXT: s_load_b32 s8, s[0:1], 0x60
+; GFX11-NEXT: s_load_b32 s9, s[0:1], 0x70
+; GFX11-NEXT: s_load_b32 s10, s[0:1], 0x80
+; GFX11-NEXT: s_load_b32 s11, s[0:1], 0x90
+; GFX11-NEXT: s_load_b32 s12, s[0:1], 0xa0
+; GFX11-NEXT: s_load_b32 s13, s[0:1], 0xb0
+; GFX11-NEXT: s_load_b32 s14, s[0:1], 0xc0
+; GFX11-NEXT: s_load_b32 s15, s[0:1], 0xd0
+; GFX11-NEXT: s_load_b32 s16, s[0:1], 0xe0
+; GFX11-NEXT: s_load_b32 s17, s[0:1], 0xf0
+; GFX11-NEXT: s_load_b32 s18, s[0:1], 0x100
+; GFX11-NEXT: s_load_b32 s19, s[0:1], 0x110
+; GFX11-NEXT: s_load_b32 s20, s[0:1], 0x120
+; GFX11-NEXT: s_load_b32 s21, s[0:1], 0x130
+; GFX11-NEXT: s_load_b32 s22, s[0:1], 0x140
+; GFX11-NEXT: s_load_b32 s23, s[0:1], 0x150
+; GFX11-NEXT: s_load_b32 s24, s[0:1], 0x160
+; GFX11-NEXT: s_load_b32 s25, s[0:1], 0x170
+; GFX11-NEXT: s_load_b32 s26, s[0:1], 0x180
+; GFX11-NEXT: s_load_b32 s27, s[0:1], 0x190
+; GFX11-NEXT: s_load_b32 s28, s[0:1], 0x1a0
+; GFX11-NEXT: s_load_b32 s29, s[0:1], 0x1b0
+; GFX11-NEXT: s_load_b32 s30, s[0:1], 0x1c0
+; GFX11-NEXT: s_load_b32 s31, s[0:1], 0x1d0
+; GFX11-NEXT: s_load_b32 s33, s[0:1], 0x1e0
+; GFX11-NEXT: s_load_b32 s34, s[0:1], 0x1f0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: s_load_b32 s35, s[0:1], 0x200
+; GFX11-NEXT: s_load_b32 s36, s[0:1], 0x210
+; GFX11-NEXT: s_load_b32 s37, s[0:1], 0x220
+; GFX11-NEXT: s_load_b32 s38, s[0:1], 0x230
+; GFX11-NEXT: s_load_b32 s39, s[0:1], 0x240
+; GFX11-NEXT: s_load_b32 s40, s[0:1], 0x250
+; GFX11-NEXT: s_load_b32 s41, s[0:1], 0x260
+; GFX11-NEXT: s_load_b32 s42, s[0:1], 0x270
+; GFX11-NEXT: s_load_b32 s43, s[0:1], 0x280
+; GFX11-NEXT: s_load_b32 s44, s[0:1], 0x290
+; GFX11-NEXT: s_load_b32 s45, s[0:1], 0x2a0
+; GFX11-NEXT: s_load_b32 s46, s[0:1], 0x2b0
+; GFX11-NEXT: s_load_b32 s47, s[0:1], 0x2c0
+; GFX11-NEXT: s_load_b32 s48, s[0:1], 0x2d0
+; GFX11-NEXT: s_load_b32 s49, s[0:1], 0x2e0
+; GFX11-NEXT: s_load_b32 s50, s[0:1], 0x2f0
+; GFX11-NEXT: s_load_b32 s51, s[0:1], 0x300
+; GFX11-NEXT: s_load_b32 s52, s[0:1], 0x310
+; GFX11-NEXT: s_load_b32 s53, s[0:1], 0x320
+; GFX11-NEXT: s_load_b32 s54, s[0:1], 0x330
+; GFX11-NEXT: s_load_b32 s55, s[0:1], 0x340
+; GFX11-NEXT: s_load_b32 s56, s[0:1], 0x350
+; GFX11-NEXT: s_load_b32 s57, s[0:1], 0x360
+; GFX11-NEXT: s_load_b32 s58, s[0:1], 0x370
+; GFX11-NEXT: s_load_b32 s59, s[0:1], 0x380
+; GFX11-NEXT: s_load_b32 s60, s[0:1], 0x390
+; GFX11-NEXT: s_load_b32 s61, s[0:1], 0x3a0
+; GFX11-NEXT: s_load_b32 s62, s[0:1], 0x3b0
+; GFX11-NEXT: s_load_b32 s63, s[0:1], 0x3c0
+; GFX11-NEXT: s_load_b32 s64, s[0:1], 0x3d0
+; GFX11-NEXT: s_load_b32 s65, s[0:1], 0x3e0
+; GFX11-NEXT: s_load_b32 s66, s[0:1], 0x3f0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s67, s[0:1], 0x400
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x410
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s2
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s3
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s4
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s5
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s6
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s7
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s8
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s9
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s10
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s11
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s12
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s13
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s14
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s15
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s16
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s17
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s18
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s19
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s20
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s21
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s22
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s23
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s24
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s25
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s26
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s27
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s28
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s29
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s30
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s31
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s33
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s34
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s35
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s36
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s37
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s38
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s39
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s40
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s41
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s42
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s43
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s44
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s45
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s46
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s47
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s48
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s49
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s50
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s51
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s52
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s53
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s54
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s55
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s56
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s57
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s58
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s59
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s60
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s61
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s62
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s63
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s64
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s65
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s66
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s67
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: long_load_chain:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_clause 0x1f
+; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x10
+; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x20
+; GFX12-NEXT: s_load_b32 s5, s[0:1], 0x30
+; GFX12-NEXT: s_load_b32 s6, s[0:1], 0x40
+; GFX12-NEXT: s_load_b32 s7, s[0:1], 0x50
+; GFX12-NEXT: s_load_b32 s8, s[0:1], 0x60
+; GFX12-NEXT: s_load_b32 s9, s[0:1], 0x70
+; GFX12-NEXT: s_load_b32 s10, s[0:1], 0x80
+; GFX12-NEXT: s_load_b32 s11, s[0:1], 0x90
+; GFX12-NEXT: s_load_b32 s12, s[0:1], 0xa0
+; GFX12-NEXT: s_load_b32 s13, s[0:1], 0xb0
+; GFX12-NEXT: s_load_b32 s14, s[0:1], 0xc0
+; GFX12-NEXT: s_load_b32 s15, s[0:1], 0xd0
+; GFX12-NEXT: s_load_b32 s16, s[0:1], 0xe0
+; GFX12-NEXT: s_load_b32 s17, s[0:1], 0xf0
+; GFX12-NEXT: s_load_b32 s18, s[0:1], 0x100
+; GFX12-NEXT: s_load_b32 s19, s[0:1], 0x110
+; GFX12-NEXT: s_load_b32 s20, s[0:1], 0x120
+; GFX12-NEXT: s_load_b32 s21, s[0:1], 0x130
+; GFX12-NEXT: s_load_b32 s22, s[0:1], 0x140
+; GFX12-NEXT: s_load_b32 s23, s[0:1], 0x150
+; GFX12-NEXT: s_load_b32 s24, s[0:1], 0x160
+; GFX12-NEXT: s_load_b32 s25, s[0:1], 0x170
+; GFX12-NEXT: s_load_b32 s26, s[0:1], 0x180
+; GFX12-NEXT: s_load_b32 s27, s[0:1], 0x190
+; GFX12-NEXT: s_load_b32 s28, s[0:1], 0x1a0
+; GFX12-NEXT: s_load_b32 s29, s[0:1], 0x1b0
+; GFX12-NEXT: s_load_b32 s30, s[0:1], 0x1c0
+; GFX12-NEXT: s_load_b32 s31, s[0:1], 0x1d0
+; GFX12-NEXT: s_load_b32 s33, s[0:1], 0x1e0
+; GFX12-NEXT: s_load_b32 s34, s[0:1], 0x1f0
+; GFX12-NEXT: s_clause 0x1f
+; GFX12-NEXT: s_load_b32 s35, s[0:1], 0x200
+; GFX12-NEXT: s_load_b32 s36, s[0:1], 0x210
+; GFX12-NEXT: s_load_b32 s37, s[0:1], 0x220
+; GFX12-NEXT: s_load_b32 s38, s[0:1], 0x230
+; GFX12-NEXT: s_load_b32 s39, s[0:1], 0x240
+; GFX12-NEXT: s_load_b32 s40, s[0:1], 0x250
+; GFX12-NEXT: s_load_b32 s41, s[0:1], 0x260
+; GFX12-NEXT: s_load_b32 s42, s[0:1], 0x270
+; GFX12-NEXT: s_load_b32 s43, s[0:1], 0x280
+; GFX12-NEXT: s_load_b32 s44, s[0:1], 0x290
+; GFX12-NEXT: s_load_b32 s45, s[0:1], 0x2a0
+; GFX12-NEXT: s_load_b32 s46, s[0:1], 0x2b0
+; GFX12-NEXT: s_load_b32 s47, s[0:1], 0x2c0
+; GFX12-NEXT: s_load_b32 s48, s[0:1], 0x2d0
+; GFX12-NEXT: s_load_b32 s49, s[0:1], 0x2e0
+; GFX12-NEXT: s_load_b32 s50, s[0:1], 0x2f0
+; GFX12-NEXT: s_load_b32 s51, s[0:1], 0x300
+; GFX12-NEXT: s_load_b32 s52, s[0:1], 0x310
+; GFX12-NEXT: s_load_b32 s53, s[0:1], 0x320
+; GFX12-NEXT: s_load_b32 s54, s[0:1], 0x330
+; GFX12-NEXT: s_load_b32 s55, s[0:1], 0x340
+; GFX12-NEXT: s_load_b32 s56, s[0:1], 0x350
+; GFX12-NEXT: s_load_b32 s57, s[0:1], 0x360
+; GFX12-NEXT: s_load_b32 s58, s[0:1], 0x370
+; GFX12-NEXT: s_load_b32 s59, s[0:1], 0x380
+; GFX12-NEXT: s_load_b32 s60, s[0:1], 0x390
+; GFX12-NEXT: s_load_b32 s61, s[0:1], 0x3a0
+; GFX12-NEXT: s_load_b32 s62, s[0:1], 0x3b0
+; GFX12-NEXT: s_load_b32 s63, s[0:1], 0x3c0
+; GFX12-NEXT: s_load_b32 s64, s[0:1], 0x3d0
+; GFX12-NEXT: s_load_b32 s65, s[0:1], 0x3e0
+; GFX12-NEXT: s_load_b32 s66, s[0:1], 0x3f0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s67, s[0:1], 0x400
+; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x410
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s2
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s3
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s4
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s5
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s6
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s7
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s8
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s9
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s10
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s11
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s12
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s13
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s14
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s15
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s16
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s17
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s18
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s19
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s20
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s21
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s22
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s23
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s24
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s25
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s26
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s27
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s28
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s29
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s30
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s31
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s33
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s34
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s35
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s36
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s37
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s38
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s39
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s40
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s41
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s42
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s43
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s44
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s45
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s46
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s47
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s48
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s49
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s50
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s51
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s52
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s53
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s54
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s55
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s56
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s57
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s58
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s59
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s60
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s61
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s62
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s63
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s64
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s65
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s66
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s67
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s0
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_endpgm
+ %v0 = load i32, ptr addrspace(1) %p
+ %ptr1 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 1
+ %v1 = load i32, ptr addrspace(1) %ptr1
+ %ptr2 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 2
+ %v2 = load i32, ptr addrspace(1) %ptr2
+ %ptr3 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 3
+ %v3 = load i32, ptr addrspace(1) %ptr3
+ %ptr4 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 4
+ %v4 = load i32, ptr addrspace(1) %ptr4
+ %ptr5 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 5
+ %v5 = load i32, ptr addrspace(1) %ptr5
+ %ptr6 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 6
+ %v6 = load i32, ptr addrspace(1) %ptr6
+ %ptr7 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 7
+ %v7 = load i32, ptr addrspace(1) %ptr7
+ %ptr8 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 8
+ %v8 = load i32, ptr addrspace(1) %ptr8
+ %ptr9 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 9
+ %v9 = load i32, ptr addrspace(1) %ptr9
+ %ptr10 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 10
+ %v10 = load i32, ptr addrspace(1) %ptr10
+ %ptr11 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 11
+ %v11 = load i32, ptr addrspace(1) %ptr11
+ %ptr12 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 12
+ %v12 = load i32, ptr addrspace(1) %ptr12
+ %ptr13 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 13
+ %v13 = load i32, ptr addrspace(1) %ptr13
+ %ptr14 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 14
+ %v14 = load i32, ptr addrspace(1) %ptr14
+ %ptr15 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 15
+ %v15 = load i32, ptr addrspace(1) %ptr15
+ %ptr16 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 16
+ %v16 = load i32, ptr addrspace(1) %ptr16
+ %ptr17 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 17
+ %v17 = load i32, ptr addrspace(1) %ptr17
+ %ptr18 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 18
+ %v18 = load i32, ptr addrspace(1) %ptr18
+ %ptr19 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 19
+ %v19 = load i32, ptr addrspace(1) %ptr19
+ %ptr20 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 20
+ %v20 = load i32, ptr addrspace(1) %ptr20
+ %ptr21 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 21
+ %v21 = load i32, ptr addrspace(1) %ptr21
+ %ptr22 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 22
+ %v22 = load i32, ptr addrspace(1) %ptr22
+ %ptr23 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 23
+ %v23 = load i32, ptr addrspace(1) %ptr23
+ %ptr24 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 24
+ %v24 = load i32, ptr addrspace(1) %ptr24
+ %ptr25 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 25
+ %v25 = load i32, ptr addrspace(1) %ptr25
+ %ptr26 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 26
+ %v26 = load i32, ptr addrspace(1) %ptr26
+ %ptr27 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 27
+ %v27 = load i32, ptr addrspace(1) %ptr27
+ %ptr28 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 28
+ %v28 = load i32, ptr addrspace(1) %ptr28
+ %ptr29 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 29
+ %v29 = load i32, ptr addrspace(1) %ptr29
+ %ptr30 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 30
+ %v30 = load i32, ptr addrspace(1) %ptr30
+ %ptr31 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 31
+ %v31 = load i32, ptr addrspace(1) %ptr31
+ %ptr32 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 32
+ %v32 = load i32, ptr addrspace(1) %ptr32
+ %ptr33 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 33
+ %v33 = load i32, ptr addrspace(1) %ptr33
+ %ptr34 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 34
+ %v34 = load i32, ptr addrspace(1) %ptr34
+ %ptr35 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 35
+ %v35 = load i32, ptr addrspace(1) %ptr35
+ %ptr36 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 36
+ %v36 = load i32, ptr addrspace(1) %ptr36
+ %ptr37 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 37
+ %v37 = load i32, ptr addrspace(1) %ptr37
+ %ptr38 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 38
+ %v38 = load i32, ptr addrspace(1) %ptr38
+ %ptr39 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 39
+ %v39 = load i32, ptr addrspace(1) %ptr39
+ %ptr40 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 40
+ %v40 = load i32, ptr addrspace(1) %ptr40
+ %ptr41 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 41
+ %v41 = load i32, ptr addrspace(1) %ptr41
+ %ptr42 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 42
+ %v42 = load i32, ptr addrspace(1) %ptr42
+ %ptr43 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 43
+ %v43 = load i32, ptr addrspace(1) %ptr43
+ %ptr44 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 44
+ %v44 = load i32, ptr addrspace(1) %ptr44
+ %ptr45 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 45
+ %v45 = load i32, ptr addrspace(1) %ptr45
+ %ptr46 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 46
+ %v46 = load i32, ptr addrspace(1) %ptr46
+ %ptr47 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 47
+ %v47 = load i32, ptr addrspace(1) %ptr47
+ %ptr48 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 48
+ %v48 = load i32, ptr addrspace(1) %ptr48
+ %ptr49 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 49
+ %v49 = load i32, ptr addrspace(1) %ptr49
+ %ptr50 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 50
+ %v50 = load i32, ptr addrspace(1) %ptr50
+ %ptr51 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 51
+ %v51 = load i32, ptr addrspace(1) %ptr51
+ %ptr52 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 52
+ %v52 = load i32, ptr addrspace(1) %ptr52
+ %ptr53 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 53
+ %v53 = load i32, ptr addrspace(1) %ptr53
+ %ptr54 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 54
+ %v54 = load i32, ptr addrspace(1) %ptr54
+ %ptr55 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 55
+ %v55 = load i32, ptr addrspace(1) %ptr55
+ %ptr56 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 56
+ %v56 = load i32, ptr addrspace(1) %ptr56
+ %ptr57 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 57
+ %v57 = load i32, ptr addrspace(1) %ptr57
+ %ptr58 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 58
+ %v58 = load i32, ptr addrspace(1) %ptr58
+ %ptr59 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 59
+ %v59 = load i32, ptr addrspace(1) %ptr59
+ %ptr60 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 60
+ %v60 = load i32, ptr addrspace(1) %ptr60
+ %ptr61 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 61
+ %v61 = load i32, ptr addrspace(1) %ptr61
+ %ptr62 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 62
+ %v62 = load i32, ptr addrspace(1) %ptr62
+ %ptr63 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 63
+ %v63 = load i32, ptr addrspace(1) %ptr63
+ %ptr64 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 64
+ %v64 = load i32, ptr addrspace(1) %ptr64
+ %ptr65 = getelementptr <4 x i32>, ptr addrspace(1) %p, i64 65
+ %v65 = load i32, ptr addrspace(1) %ptr65
+
+ call void asm sideeffect "; use $0", "s"(i32 %v0)
+ call void asm sideeffect "; use $0", "s"(i32 %v1)
+ call void asm sideeffect "; use $0", "s"(i32 %v2)
+ call void asm sideeffect "; use $0", "s"(i32 %v3)
+ call void asm sideeffect "; use $0", "s"(i32 %v4)
+ call void asm sideeffect "; use $0", "s"(i32 %v5)
+ call void asm sideeffect "; use $0", "s"(i32 %v6)
+ call void asm sideeffect "; use $0", "s"(i32 %v7)
+ call void asm sideeffect "; use $0", "s"(i32 %v8)
+ call void asm sideeffect "; use $0", "s"(i32 %v9)
+ call void asm sideeffect "; use $0", "s"(i32 %v10)
+ call void asm sideeffect "; use $0", "s"(i32 %v11)
+ call void asm sideeffect "; use $0", "s"(i32 %v12)
+ call void asm sideeffect "; use $0", "s"(i32 %v13)
+ call void asm sideeffect "; use $0", "s"(i32 %v14)
+ call void asm sideeffect "; use $0", "s"(i32 %v15)
+ call void asm sideeffect "; use $0", "s"(i32 %v16)
+ call void asm sideeffect "; use $0", "s"(i32 %v17)
+ call void asm sideeffect "; use $0", "s"(i32 %v18)
+ call void asm sideeffect "; use $0", "s"(i32 %v19)
+ call void asm sideeffect "; use $0", "s"(i32 %v20)
+ call void asm sideeffect "; use $0", "s"(i32 %v21)
+ call void asm sideeffect "; use $0", "s"(i32 %v22)
+ call void asm sideeffect "; use $0", "s"(i32 %v23)
+ call void asm sideeffect "; use $0", "s"(i32 %v24)
+ call void asm sideeffect "; use $0", "s"(i32 %v25)
+ call void asm sideeffect "; use $0", "s"(i32 %v26)
+ call void asm sideeffect "; use $0", "s"(i32 %v27)
+ call void asm sideeffect "; use $0", "s"(i32 %v28)
+ call void asm sideeffect "; use $0", "s"(i32 %v29)
+ call void asm sideeffect "; use $0", "s"(i32 %v30)
+ call void asm sideeffect "; use $0", "s"(i32 %v31)
+ call void asm sideeffect "; use $0", "s"(i32 %v32)
+ call void asm sideeffect "; use $0", "s"(i32 %v33)
+ call void asm sideeffect "; use $0", "s"(i32 %v34)
+ call void asm sideeffect "; use $0", "s"(i32 %v35)
+ call void asm sideeffect "; use $0", "s"(i32 %v36)
+ call void asm sideeffect "; use $0", "s"(i32 %v37)
+ call void asm sideeffect "; use $0", "s"(i32 %v38)
+ call void asm sideeffect "; use $0", "s"(i32 %v39)
+ call void asm sideeffect "; use $0", "s"(i32 %v40)
+ call void asm sideeffect "; use $0", "s"(i32 %v41)
+ call void asm sideeffect "; use $0", "s"(i32 %v42)
+ call void asm sideeffect "; use $0", "s"(i32 %v43)
+ call void asm sideeffect "; use $0", "s"(i32 %v44)
+ call void asm sideeffect "; use $0", "s"(i32 %v45)
+ call void asm sideeffect "; use $0", "s"(i32 %v46)
+ call void asm sideeffect "; use $0", "s"(i32 %v47)
+ call void asm sideeffect "; use $0", "s"(i32 %v48)
+ call void asm sideeffect "; use $0", "s"(i32 %v49)
+ call void asm sideeffect "; use $0", "s"(i32 %v50)
+ call void asm sideeffect "; use $0", "s"(i32 %v51)
+ call void asm sideeffect "; use $0", "s"(i32 %v52)
+ call void asm sideeffect "; use $0", "s"(i32 %v53)
+ call void asm sideeffect "; use $0", "s"(i32 %v54)
+ call void asm sideeffect "; use $0", "s"(i32 %v55)
+ call void asm sideeffect "; use $0", "s"(i32 %v56)
+ call void asm sideeffect "; use $0", "s"(i32 %v57)
+ call void asm sideeffect "; use $0", "s"(i32 %v58)
+ call void asm sideeffect "; use $0", "s"(i32 %v59)
+ call void asm sideeffect "; use $0", "s"(i32 %v60)
+ call void asm sideeffect "; use $0", "s"(i32 %v61)
+ call void asm sideeffect "; use $0", "s"(i32 %v62)
+ call void asm sideeffect "; use $0", "s"(i32 %v63)
+ call void asm sideeffect "; use $0", "s"(i32 %v64)
+ call void asm sideeffect "; use $0", "s"(i32 %v65)
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index db0d9c1aaa216a..0992e9e300f136 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -2941,7 +2941,7 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32>
; GFX11-LABEL: v_vselect_v32f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x20
+; GFX11-NEXT: s_clause 0x1f
; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:120
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:112
; GFX11-NEXT: scratch_load_b32 v33, off, s32
>From 182deb68dc56419a809ccb10e927ffd824d40846 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 9 Feb 2024 19:57:42 +0000
Subject: [PATCH 2/2] Actually fix code format
---
llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 46dee9d6d04e8b..01580fe345ba26 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -177,7 +177,8 @@ class SIInsertHardClauses : public MachineFunctionPass {
bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
if (CI.First == CI.Last)
return false;
- assert(CI.Length <= ST->maxHardClauseLength() && "Hard clause is too long!");
+ assert(CI.Length <= ST->maxHardClauseLength() &&
+ "Hard clause is too long!");
auto &MBB = *CI.First->getParent();
auto ClauseMI =
More information about the llvm-commits
mailing list