[llvm] 42a5560 - [AMDGPU] New SIInsertHardClauses pass
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu May 14 10:55:07 PDT 2020
Author: Jay Foad
Date: 2020-05-14T18:54:49+01:00
New Revision: 42a556050346ffaa0fe3c0cc4dab24ad65a115a4
URL: https://github.com/llvm/llvm-project/commit/42a556050346ffaa0fe3c0cc4dab24ad65a115a4
DIFF: https://github.com/llvm/llvm-project/commit/42a556050346ffaa0fe3c0cc4dab24ad65a115a4.diff
LOG: [AMDGPU] New SIInsertHardClauses pass
Enable clausing of memory loads on gfx10 by adding a new pass to insert
the s_clause instructions that mark the start of each hard clause.
Differential Revision: https://reviews.llvm.org/D79792
Added:
llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
llvm/test/CodeGen/AMDGPU/hard-clauses.mir
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/CMakeLists.txt
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
llvm/test/CodeGen/AMDGPU/idot2.ll
llvm/test/CodeGen/AMDGPU/idot4s.ll
llvm/test/CodeGen/AMDGPU/idot4u.ll
llvm/test/CodeGen/AMDGPU/idot8s.ll
llvm/test/CodeGen/AMDGPU/idot8u.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
llvm/test/CodeGen/AMDGPU/smrd.ll
llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ded110742327..b39dd1bc8bae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -229,6 +229,9 @@ extern char &SIMemoryLegalizerID;
void initializeSIModeRegisterPass(PassRegistry&);
extern char &SIModeRegisterID;
+void initializeSIInsertHardClausesPass(PassRegistry &);
+extern char &SIInsertHardClausesID;
+
void initializeSIInsertWaitcntsPass(PassRegistry&);
extern char &SIInsertWaitcntsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 68372822b799..cac7c56360d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -1038,6 +1038,8 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
return HasNSAtoVMEMBug;
}
+ bool hasHardClauses() const { return getGeneration() >= GFX10; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b5a2abcd7a82..1ee7f3480f87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -241,6 +241,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
+ initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIModeRegisterPass(*PR);
initializeSIWholeQuadModePass(*PR);
@@ -1044,6 +1045,8 @@ void GCNPassConfig::addPreEmitPass() {
// FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
// be better for it to emit S_NOP <N> when possible.
addPass(&PostRAHazardRecognizerID);
+ if (getOptLevel() > CodeGenOpt::None)
+ addPass(&SIInsertHardClausesID);
addPass(&SIRemoveShortExecBranchesID);
addPass(&SIPreEmitPeepholeID);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c273ea89bd91..e82c98d4b5fd 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -107,6 +107,7 @@ add_llvm_target(AMDGPUCodeGen
SIFoldOperands.cpp
SIFormMemoryClauses.cpp
SIFrameLowering.cpp
+ SIInsertHardClauses.cpp
SIInsertSkips.cpp
SIInsertWaitcnts.cpp
SIInstrInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
new file mode 100644
index 000000000000..49f4ee0dd5ef
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -0,0 +1,200 @@
+//===- SIInsertHardClauses.cpp - Insert Hard Clauses ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_clause instructions to form hard clauses.
+///
+/// Clausing load instructions can give cache coherency benefits. Before gfx10,
+/// the hardware automatically detected "soft clauses", which were sequences of
+/// memory instructions of the same type. In gfx10 this detection was removed,
+/// and the s_clause instruction was introduced to explicitly mark "hard
+/// clauses".
+///
+/// It's the scheduler's job to form the clauses by putting similar memory
+/// instructions next to each other. Our job is just to insert an s_clause
+/// instruction to mark the start of each clause.
+///
+/// Note that hard clauses are very similar to, but logically distinct from, the
+/// groups of instructions that have to be restartable when XNACK is enabled.
+/// The rules are slightly
diff erent in each case. For example an s_nop
+/// instruction breaks a restartable group, but can appear in the middle of a
+/// hard clause. (Before gfx10 there wasn't a distinction, and both were called
+/// "soft clauses" or just "clauses".)
+///
+/// The SIFormMemoryClauses pass and GCNHazardRecognizer deal with restartable
+/// groups, not hard clauses.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-insert-hard-clauses"
+
+namespace {
+
+enum HardClauseType {
+ // Texture, buffer, global or scratch memory instructions.
+ HARDCLAUSE_VMEM,
+ // Flat (not global or scratch) memory instructions.
+ HARDCLAUSE_FLAT,
+ // Instructions that access LDS.
+ HARDCLAUSE_LDS,
+ // Scalar memory instructions.
+ HARDCLAUSE_SMEM,
+ // VALU instructions.
+ HARDCLAUSE_VALU,
+ LAST_REAL_HARDCLAUSE_TYPE = HARDCLAUSE_VALU,
+
+ // Internal instructions, which are allowed in the middle of a hard clause,
+ // except for s_waitcnt.
+ HARDCLAUSE_INTERNAL,
+ // Instructions that are not allowed in a hard clause: SALU, export, branch,
+ // message, GDS, s_waitcnt and anything else not mentioned above.
+ HARDCLAUSE_ILLEGAL,
+};
+
+HardClauseType getHardClauseType(const MachineInstr &MI) {
+ // On current architectures we only get a benefit from clausing loads.
+ if (MI.mayLoad()) {
+ if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
+ return HARDCLAUSE_VMEM;
+ if (SIInstrInfo::isFLAT(MI))
+ return HARDCLAUSE_FLAT;
+ // TODO: LDS
+ if (SIInstrInfo::isSMRD(MI))
+ return HARDCLAUSE_SMEM;
+ }
+
+ // Don't form VALU clauses. It's not clear what benefit they give, if any.
+
+ // In practice s_nop is the only internal instructions we're likely to see.
+ // It's safe to treat the rest as illegal.
+ if (MI.getOpcode() == AMDGPU::S_NOP)
+ return HARDCLAUSE_INTERNAL;
+ return HARDCLAUSE_ILLEGAL;
+}
+
+class SIInsertHardClauses : public MachineFunctionPass {
+public:
+ static char ID;
+
+ SIInsertHardClauses() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ // Track information about a clause as we discover it.
+ struct ClauseInfo {
+ // The type of all (non-internal) instructions in the clause.
+ HardClauseType Type = HARDCLAUSE_ILLEGAL;
+ // The first (necessarily non-internal) instruction in the clause.
+ MachineInstr *First = nullptr;
+ // The last non-internal instruction in the clause.
+ MachineInstr *Last = nullptr;
+ // The length of the clause including any internal instructions in the
+ // middle.
+ unsigned Length = 0;
+ // The base operands of *Last.
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ };
+
+ bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
+ assert(CI.Length ==
+ std::distance(CI.First->getIterator(), CI.Last->getIterator()) + 1);
+ if (CI.Length < 2)
+ return false;
+ assert(CI.Length <= 64 && "Hard clause is too long!");
+
+ auto &MBB = *CI.First->getParent();
+ auto ClauseMI =
+ BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE))
+ .addImm(CI.Length - 1);
+ finalizeBundle(MBB, ClauseMI->getIterator(),
+ std::next(CI.Last->getIterator()));
+ return true;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasHardClauses())
+ return false;
+
+ const SIInstrInfo *SII = ST.getInstrInfo();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+
+ bool Changed = false;
+ for (auto &MBB : MF) {
+ ClauseInfo CI;
+ for (auto &MI : MBB) {
+ HardClauseType Type = getHardClauseType(MI);
+
+ int64_t Dummy1;
+ bool Dummy2;
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
+ if (!SII->getMemOperandsWithOffset(MI, BaseOps, Dummy1, Dummy2,
+ TRI)) {
+ // We failed to get the base operands, so we'll never clause this
+ // instruction with any other, so pretend it's illegal.
+ Type = HARDCLAUSE_ILLEGAL;
+ }
+ }
+
+ if (CI.Length == 64 ||
+ (CI.Length && Type != HARDCLAUSE_INTERNAL &&
+ (Type != CI.Type ||
+ // Note that we lie to shouldClusterMemOps about the size of the
+ // cluster. When shouldClusterMemOps is called from the machine
+ // scheduler it limits the size of the cluster to avoid increasing
+ // register pressure too much, but this pass runs after register
+ // allocation so there is no need for that kind of limit.
+ !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2)))) {
+ // Finish the current clause.
+ Changed |= emitClause(CI, SII);
+ CI = ClauseInfo();
+ }
+
+ if (CI.Length) {
+ // Extend the current clause.
+ ++CI.Length;
+ if (Type != HARDCLAUSE_INTERNAL) {
+ CI.Last = &MI;
+ CI.BaseOps = std::move(BaseOps);
+ }
+ } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
+ // Start a new clause.
+ CI = ClauseInfo{Type, &MI, &MI, 1, std::move(BaseOps)};
+ }
+ }
+
+ // Finish the last clause in the basic block if any.
+ if (CI.Length)
+ Changed |= emitClause(CI, SII);
+ }
+
+ return Changed;
+ }
+};
+
+} // namespace
+
+char SIInsertHardClauses::ID = 0;
+
+char &llvm::SIInsertHardClausesID = SIInsertHardClauses::ID;
+
+INITIALIZE_PASS(SIInsertHardClauses, DEBUG_TYPE, "SI Insert Hard Clauses",
+ false, false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 4d9416ea78cb..836678055677 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -255,6 +255,7 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32]
;
; GFX10_W32-LABEL: test_div_fmas_f32:
; GFX10_W32: ; %bb.0:
+; GFX10_W32-NEXT: s_clause 0x4
; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8
; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x94
@@ -274,6 +275,7 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32]
;
; GFX10_W64-LABEL: test_div_fmas_f32:
; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_clause 0x4
; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8
; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x94
@@ -333,6 +335,7 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %o
;
; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0:
; GFX10_W32: ; %bb.0:
+; GFX10_W32-NEXT: s_clause 0x3
; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8
; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94
; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70
@@ -350,6 +353,7 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %o
;
; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0:
; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_clause 0x3
; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8
; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94
; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70
@@ -407,6 +411,7 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %o
;
; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1:
; GFX10_W32: ; %bb.0:
+; GFX10_W32-NEXT: s_clause 0x3
; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x58
; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x2c
@@ -424,6 +429,7 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %o
;
; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1:
; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_clause 0x3
; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x58
; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x2c
@@ -481,6 +487,7 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %o
;
; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2:
; GFX10_W32: ; %bb.0:
+; GFX10_W32-NEXT: s_clause 0x3
; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0xb8
; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c
@@ -498,6 +505,7 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %o
;
; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2:
; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_clause 0x3
; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0xb8
; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c
@@ -559,6 +567,7 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %
;
; GFX10_W32-LABEL: test_div_fmas_f64:
; GFX10_W32: ; %bb.0:
+; GFX10_W32-NEXT: s_clause 0x1
; GFX10_W32-NEXT: s_load_dword s8, s[0:1], 0x44
; GFX10_W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10_W32-NEXT: ; implicit-def: $vcc_hi
@@ -577,6 +586,7 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %
;
; GFX10_W64-LABEL: test_div_fmas_f64:
; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_clause 0x1
; GFX10_W64-NEXT: s_load_dword s8, s[0:1], 0x44
; GFX10_W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
@@ -637,6 +647,7 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou
;
; GFX10_W32-LABEL: test_div_fmas_f32_cond_to_vcc:
; GFX10_W32: ; %bb.0:
+; GFX10_W32-NEXT: s_clause 0x1
; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10_W32-NEXT: ; implicit-def: $vcc_hi
@@ -655,6 +666,7 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou
;
; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc:
; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_clause 0x1
; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
@@ -712,6 +724,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
;
; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
; GFX10_W32: ; %bb.0:
+; GFX10_W32-NEXT: s_clause 0x3
; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70
; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94
; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c
@@ -729,6 +742,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
;
; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_clause 0x3
; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70
; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94
; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c
@@ -784,6 +798,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
;
; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
; GFX10_W32: ; %bb.0:
+; GFX10_W32-NEXT: s_clause 0x3
; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x70
; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94
; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c
@@ -801,6 +816,7 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
;
; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_clause 0x3
; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x70
; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94
; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c
@@ -901,6 +917,7 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo
; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s2, 0, s2
+; GFX10_W32-NEXT: s_clause 0x2
; GFX10_W32-NEXT: global_load_dword v1, v[1:2], off
; GFX10_W32-NEXT: global_load_dword v2, v[3:4], off offset:-4
; GFX10_W32-NEXT: global_load_dword v3, v[3:4], off
@@ -932,6 +949,7 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v4, vcc, 0, v2, vcc
; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2
+; GFX10_W64-NEXT: s_clause 0x2
; GFX10_W64-NEXT: global_load_dword v1, v[1:2], off
; GFX10_W64-NEXT: global_load_dword v2, v[3:4], off offset:-4
; GFX10_W64-NEXT: global_load_dword v3, v[3:4], off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index f0e05a81cc0a..de2b58aa6fed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -57,6 +57,7 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: global_load_dword v1, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -131,6 +132,7 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: global_load_dword v1, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -211,6 +213,7 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -291,6 +294,7 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -889,6 +893,7 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %
;
; GFX10-LABEL: test_div_scale_f32_all_scalar_1:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -934,6 +939,7 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %
;
; GFX10-LABEL: test_div_scale_f32_all_scalar_2:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -981,6 +987,7 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)*
;
; GFX10-LABEL: test_div_scale_f64_all_scalar_1:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1028,6 +1035,7 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)*
;
; GFX10-LABEL: test_div_scale_f64_all_scalar_2:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1229,6 +1237,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: global_load_dword v1, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
@@ -1309,6 +1318,7 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: global_load_dword v1, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index 0164678bbf8c..28c7d47e855f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -4,6 +4,7 @@
define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GCN-LABEL: test_wave32:
; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_dword s1, s[4:5], 0x0
; GCN-NEXT: s_load_dword s0, s[4:5], 0x24
; GCN-NEXT: ; implicit-def: $vcc_hi
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
index ce186792ae88..19c546bf9838 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
@@ -4,6 +4,7 @@
define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GCN-LABEL: test_wave32:
; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
; GCN-NEXT: s_load_dword s1, s[4:5], 0x24
; GCN-NEXT: ; implicit-def: $vcc_hi
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
index f3ed7e219ab8..6c0fe47d5ae3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -20,6 +20,7 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
;
; GFX10-LABEL: dpp_test:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa]
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa]
; GFX10-NEXT: ; implicit-def: $vcc_hi
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 751f3b703024..05717460bade 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -19,6 +19,7 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
;
; GFX10-LABEL: dpp_test:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index eb1618f346e4..43c8653f2828 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -276,6 +276,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1064-LABEL: add_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_clause 0x1
; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX1064-NEXT: ; implicit-def: $vgpr1
@@ -310,6 +311,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_clause 0x1
; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX1032-NEXT: s_mov_b32 s2, exec_lo
@@ -1827,6 +1829,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1064-LABEL: sub_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_clause 0x1
; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX1064-NEXT: ; implicit-def: $vgpr1
@@ -1861,6 +1864,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
;
; GFX1032-LABEL: sub_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
+; GFX1032-NEXT: s_clause 0x1
; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX1032-NEXT: s_mov_b32 s2, exec_lo
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
new file mode 100644
index 000000000000..91f86d245b80
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
@@ -0,0 +1,178 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
+
+---
+name: long_clause
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
+ ; CHECK-LABEL: name: long_clause
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
+ ; CHECK: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit-def $vgpr33, implicit-def $vgpr33_lo16, implicit-def $vgpr33_hi16, implicit-def $vgpr34, implicit-def $vgpr34_lo16, implicit-def $vgpr34_hi16, implicit-def $vgpr35, implicit-def $vgpr35_lo16, implicit-def $vgpr35_hi16, implicit-def $vgpr36, implicit-def $vgpr36_lo16, implicit-def $vgpr36_hi16, implicit-def $vgpr37, implicit-def $vgpr37_lo16, implicit-def $vgpr37_hi16, implicit-def $vgpr38, implicit-def $vgpr38_lo16, implicit-def $vgpr38_hi16, implicit-def $vgpr39, implicit-def $vgpr39_lo16, implicit-def $vgpr39_hi16, implicit-def $vgpr40, implicit-def $vgpr40_lo16, implicit-def $vgpr40_hi16, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; CHECK: S_CLAUSE 63
+ ; CHECK: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: }
+ ; CHECK: BUNDLE implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; CHECK: S_CLAUSE 15
+ ; CHECK: $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK: }
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, 0, 0, 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 42921dc8c2ba..1a08586b8e95 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -102,6 +102,7 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -244,6 +245,7 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot2_MulMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -377,6 +379,7 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -511,6 +514,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot2_MixedTypedMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -648,6 +652,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot2_alt_AddOperands:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -782,6 +787,7 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot2_MixedExt:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -905,6 +911,7 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: notudot2_SameVec:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1039,6 +1046,7 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot2_v4i16:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1171,6 +1179,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot2_v4i16_Hi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1310,6 +1319,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: notudot2_v4i16_Even:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1455,6 +1465,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: notudot2_v4i16_Middle:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1600,6 +1611,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: notudot2_DiffIndex:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1749,6 +1761,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
;
; GFX10-DL-LABEL: udot2_MultipleUses_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1896,6 +1909,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
;
; GFX10-DL-LABEL: idot2_MultipleUses_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -2047,6 +2061,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -2195,6 +2210,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -2347,6 +2363,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -2495,6 +2512,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -2786,6 +2804,7 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: notsdot2_sext8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 2a76184352a4..baac247930a5 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -119,6 +119,7 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -641,6 +642,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot4_multiuse_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -839,6 +841,7 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot4_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 06b758fa285d..58f268f16d46 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -122,6 +122,7 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1138,6 +1139,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot4_multiuse_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1345,6 +1347,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot4_multiuse_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1743,6 +1746,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot4_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 38b8bc37f5f2..0fb137239942 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -167,6 +167,7 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot8_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1130,6 +1131,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot8_multiuses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1401,6 +1403,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot8_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 21eede8df373..ba5c4d2f2b5f 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -167,6 +167,7 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot8_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1654,6 +1655,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot8_multiuses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -1924,6 +1926,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot8_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
@@ -3085,6 +3088,7 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
;
; GFX10-DL-LABEL: udot8_variant1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
index 9f18f4df40bf..e79920bb2d1b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -120,6 +120,7 @@ entry:
;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_and:
;CHECK-NEXT: %bb.
+;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
;CHECK: s_waitcnt
@@ -145,6 +146,7 @@ main_body:
;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_or:
;CHECK-NEXT: %bb.
;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 6, v0
+;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4
;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28
;CHECK: s_waitcnt
@@ -170,6 +172,7 @@ main_body:
;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc:
;CHECK-NEXT: %bb.
+;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
@@ -233,6 +236,7 @@ main_body:
;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged:
;CHECK-NEXT: %bb.
+;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
;CHECK: s_waitcnt
@@ -402,6 +406,7 @@ main_body:
;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_merged:
;CHECK-NEXT: %bb.
+;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
;CHECK: s_waitcnt
@@ -420,6 +425,7 @@ main_body:
;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_swizzled_not_merged:
;CHECK-NEXT: %bb.
+;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:4
;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:8
;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:12
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index cdac927bdd74..3965b460cb1d 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -154,6 +154,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2
@@ -986,6 +987,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v3, v[0:1], off
; GFX10-NEXT: global_load_ushort v4, v[0:1], off
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index 284a118f4cd1..639ed313522c 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -377,6 +377,7 @@ main_body:
; GCN-NEXT: %bb.
; SICI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1
; SICI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x7
+; GFX10-NEXT: s_clause
; VIGFX9_10-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4
; VIGFX9_10-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c
define amdgpu_ps void @smrd_imm_merged(<4 x i32> inreg %desc) #0 {
@@ -447,6 +448,7 @@ main_body:
; GCN-LABEL: {{^}}smrd_vgpr_merged:
; GCN-NEXT: %bb.
+; GFX10-NEXT: s_clause
; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
; GCN-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
define amdgpu_ps void @smrd_vgpr_merged(<4 x i32> inreg %desc, i32 %a) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index c3f41a467a20..f323cf7d76f2 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -13,6 +13,7 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) {
; GCN-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8
; GCN-NEXT: s_mov_b32 s5, exec_lo
; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GCN-NEXT: s_clause 0x1
; GCN-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index c6f2ec5f3e34..4848ec6c4c38 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -67,10 +67,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX10: buffer_load_dword v43, off, s[0:3], s33
+; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12
; GFX10: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10: s_setpc_b64 s[4:5]
@@ -149,12 +149,12 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[44:47] dmask:0x1
-; GFX10: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX10: buffer_load_dword v44, off, s[0:3], s33
+; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4
+; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8
+; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
+; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16
+; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20
; GFX10: s_setpc_b64 s[4:5]
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
More information about the llvm-commits
mailing list