[llvm] [AMDGPU] Propagate alias information in AMDGPULowerKernelArguments. (PR #144714)
Leon Clark via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 18 20:15:12 PDT 2025
https://github.com/PeddleSpam updated https://github.com/llvm/llvm-project/pull/144714
>From c4745be7beaf4873538991a0d8aa9063e79cfc5c Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 18 Jun 2025 15:39:48 +0100
Subject: [PATCH 1/2] [AMDGPU] Propagate alias information in
AMDGPULowerKernelArguments.
This patch reimplements https://reviews.llvm.org/D108363 and https://reviews.llvm.org/D108361 to emit !noalias and !alias.scope metadata for noalias kernel arguments.
---
llvm/include/llvm/Transforms/Utils/Cloning.h | 11 +
.../AMDGPU/AMDGPULowerKernelArguments.cpp | 11 +-
llvm/lib/Transforms/Utils/InlineFunction.cpp | 100 +-
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 200 +-
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 280 +--
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 78 +-
.../llvm.amdgcn.sched.group.barrier.gfx11.ll | 198 +-
...vm.amdgcn.sched.group.barrier.iterative.ll | 2109 ++++++++++++-----
.../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 526 ++--
llvm/test/CodeGen/AMDGPU/lower-kernargs.ll | 55 +-
.../AMDGPU/ptr-buffer-alias-scheduling.ll | 88 +-
llvm/test/CodeGen/AMDGPU/sub.ll | 58 +-
12 files changed, 2438 insertions(+), 1276 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h
index 6b56230a6e1d4..05490e6c81bc8 100644
--- a/llvm/include/llvm/Transforms/Utils/Cloning.h
+++ b/llvm/include/llvm/Transforms/Utils/Cloning.h
@@ -363,6 +363,17 @@ LLVM_ABI void updateProfileCallee(
Function *Callee, int64_t EntryDelta,
const ValueMap<const Value *, WeakTrackingVH> *VMap = nullptr);
+/// Adds `!noalias` and `!alias.scope` metadata for memory accesses based on
+/// the `noalias` arguments of the function called by `CB`.
+void addAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
+ const DataLayout &DL, AAResults *CalleeAAR,
+ ClonedCodeInfo &InlinedFunctionInfo,
+ bool UseNoAliasIntrinsic);
+
+/// Adds `!noalias` and `!alias.scope` metadata for memory accesses based on
+/// the `noalias` arguments of `F`.
+void addAliasScopeMetadata(Function &F);
+
/// Find the 'llvm.experimental.noalias.scope.decl' intrinsics in the specified
/// basic blocks and extract their scope. These are candidates for duplication
/// when cloning.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index dec781d71c54e..edd19e1ef1241 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
@@ -86,6 +87,9 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
uint64_t ExplicitArgOffset = 0;
+
+ addAliasScopeMetadata(F);
+
for (Argument &Arg : F.args()) {
const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
@@ -124,11 +128,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
!ST.hasUsableDSOffset())
continue;
-
- // FIXME: We can replace this with equivalent alias.scope/noalias
- // metadata, but this appears to be a lot of work.
- if (Arg.hasNoAliasAttr())
- continue;
}
auto *VT = dyn_cast<FixedVectorType>(ArgTy);
@@ -215,8 +214,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
}
}
- // TODO: Convert noalias arg to !noalias
-
if (DoShiftOpt) {
Value *ExtractBits = OffsetDiff == 0 ?
Load : Builder.CreateLShr(Load, OffsetDiff * 8);
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 7df5e9958182c..a56dc39e569c0 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -51,6 +51,7 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -1114,17 +1115,30 @@ void ScopedAliasMetadataDeepCloner::remap(Function::iterator FStart,
/// then add new alias scopes for each noalias argument, tag the mapped noalias
/// parameters with noalias metadata specifying the new scope, and tag all
/// non-derived loads, stores and memory intrinsics with the new alias scopes.
-static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
- const DataLayout &DL, AAResults *CalleeAAR,
- ClonedCodeInfo &InlinedFunctionInfo) {
- if (!EnableNoAliasConversion)
- return;
-
- const Function *CalledFunc = CB.getCalledFunction();
+static void addAliasScopeMetadataImpl(CallBase *CB, Function *F,
+ ValueToValueMapTy *VMap,
+ const DataLayout &DL,
+ AAResults *CalleeAAR,
+ ClonedCodeInfo *InlinedFunctionInfo,
+ bool UseNoAliasIntrinsic) {
+ assert(CB || F);
+ const Function *CalledFunc = CB ? CB->getCalledFunction() : F;
SmallVector<const Argument *, 4> NoAliasArgs;
+ std::function<bool(const Argument *, Attribute::AttrKind)> paramHasAttr;
+ if (CB) {
+ paramHasAttr = [&](const Argument *Arg, Attribute::AttrKind Attr) -> bool {
+ return CB->paramHasAttr(Arg->getArgNo(), Attr);
+ };
+
+ } else {
+ paramHasAttr = [&](const Argument *Arg, Attribute::AttrKind Attr) -> bool {
+ return Arg->hasAttribute(Attr);
+ };
+ }
+
for (const Argument &Arg : CalledFunc->args())
- if (CB.paramHasAttr(Arg.getArgNo(), Attribute::NoAlias) && !Arg.use_empty())
+ if (paramHasAttr(&Arg, Attribute::NoAlias) && !Arg.use_empty())
NoAliasArgs.push_back(&Arg);
if (NoAliasArgs.empty())
@@ -1166,29 +1180,20 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
NewScopes.insert(std::make_pair(A, NewScope));
if (UseNoAliasIntrinsic) {
+ assert(CB);
// Introduce a llvm.experimental.noalias.scope.decl for the noalias
// argument.
MDNode *AScopeList = MDNode::get(CalledFunc->getContext(), NewScope);
auto *NoAliasDecl =
- IRBuilder<>(&CB).CreateNoAliasScopeDeclaration(AScopeList);
+ IRBuilder<>(CB).CreateNoAliasScopeDeclaration(AScopeList);
// Ignore the result for now. The result will be used when the
// llvm.noalias intrinsic is introduced.
(void)NoAliasDecl;
}
}
- // Iterate over all new instructions in the map; for all memory-access
- // instructions, add the alias scope metadata.
- for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
- VMI != VMIE; ++VMI) {
- if (const Instruction *I = dyn_cast<Instruction>(VMI->first)) {
- if (!VMI->second)
- continue;
-
- Instruction *NI = dyn_cast<Instruction>(VMI->second);
- if (!NI || InlinedFunctionInfo.isSimplified(I, NI))
- continue;
-
+ {
+ auto addAliasMD = [&](const Instruction *I, Instruction *NI) -> void {
bool IsArgMemOnlyCall = false, IsFuncCall = false;
SmallVector<const Value *, 2> PtrArgs;
@@ -1207,7 +1212,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// know that about the inlined clone of this call site, and we don't
// need to add metadata.
if (Call->doesNotAccessMemory())
- continue;
+ return;
IsFuncCall = true;
if (CalleeAAR) {
@@ -1215,7 +1220,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// We'll retain this knowledge without additional metadata.
if (ME.onlyAccessesInaccessibleMem())
- continue;
+ return;
if (ME.onlyAccessesArgPointees())
IsArgMemOnlyCall = true;
@@ -1237,7 +1242,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// However, if this is a call, this we might just alias with none of the
// noalias arguments.
if (PtrArgs.empty() && !IsFuncCall)
- continue;
+ return;
// It is possible that there is only one underlying object, but you
// need to go through several PHIs to see it, and thus could be
@@ -1270,7 +1275,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// completely describe the aliasing properties using alias.scope
// metadata (and, thus, won't add any).
if (const Argument *A = dyn_cast<Argument>(V)) {
- if (!CB.paramHasAttr(A->getArgNo(), Attribute::NoAlias))
+ if (!paramHasAttr(A, Attribute::NoAlias))
UsesAliasingPtr = true;
} else {
UsesAliasingPtr = true;
@@ -1292,7 +1297,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// Nothing we can do if the used underlying object cannot be reliably
// determined.
if (UsesUnknownObject)
- continue;
+ return;
// A function call can always get captured noalias pointers (via other
// parameters, globals, etc.).
@@ -1353,10 +1358,49 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
LLVMContext::MD_alias_scope,
MDNode::concatenate(NI->getMetadata(LLVMContext::MD_alias_scope),
MDNode::get(CalledFunc->getContext(), Scopes)));
+ };
+
+ if (VMap) {
+ assert(InlinedFunctionInfo);
+
+ for (ValueToValueMapTy::iterator VMI = VMap->begin(), VMIE = VMap->end();
+ VMI != VMIE; ++VMI) {
+ const Instruction *I = dyn_cast<Instruction>(VMI->first);
+ if (!I || !VMI->second)
+ continue;
+
+ Instruction *NI = dyn_cast<Instruction>(VMI->second);
+ if (!NI || InlinedFunctionInfo->isSimplified(I, NI))
+ continue;
+
+ addAliasMD(I, NI);
+ }
+
+ } else {
+ for (auto It = inst_begin(F), End = inst_end(F); It != End; ++It) {
+ Instruction *I = &(*It);
+ addAliasMD(I, I);
+ }
}
}
}
+void llvm::addAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
+ const DataLayout &DL, AAResults *CalleeAAR,
+ ClonedCodeInfo &InlinedFunctionInfo,
+ bool UseNoAliasIntrinsic) {
+ addAliasScopeMetadataImpl(&CB, /* F */ nullptr, &VMap, DL, CalleeAAR,
+ &InlinedFunctionInfo, UseNoAliasIntrinsic);
+}
+
+void llvm::addAliasScopeMetadata(Function &F) {
+ addAliasScopeMetadataImpl(/* CB */ nullptr, &F, /* VMap */ nullptr,
+ F.getParent()->getDataLayout(),
+ /* CalleeAAR */ nullptr,
+ /* InlinedFunctionInfo */ nullptr,
+ /* UseNoAliasIntrinsic */ false);
+}
+
static bool MayContainThrowingOrExitingCallAfterCB(CallBase *Begin,
ReturnInst *End) {
@@ -2797,7 +2841,9 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
SAMetadataCloner.remap(FirstNewBlock, Caller->end());
// Add noalias metadata if necessary.
- AddAliasScopeMetadata(CB, VMap, DL, CalleeAAR, InlinedFunctionInfo);
+ if (EnableNoAliasConversion)
+ addAliasScopeMetadata(CB, VMap, DL, CalleeAAR, InlinedFunctionInfo,
+ UseNoAliasIntrinsic);
// Clone return attributes on the callsite into the calls within the inlined
// function which feed into its return value.
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 99b7c7737f4ae..a87baca5a5878 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -105,11 +105,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v2, v0
+; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -181,8 +181,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: v_ffbh_u32_e32 v0, v0
@@ -261,8 +261,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v3, v3
; VI-NEXT: v_ffbh_u32_e32 v2, v2
@@ -534,13 +534,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_zero_undef_i64_with_select:
@@ -605,15 +605,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
-; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: v_ffbh_u32_e32 v1, v1
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT: flat_load_ubyte v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; VI-NEXT: v_ffbh_u32_e32 v3, v3
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -706,21 +706,21 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_readfirstlane_b32 s2, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s3, v0
-; VI-NEXT: s_lshl_b32 s2, s2, 8
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: s_lshl_b32 s3, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_flbit_i32_b32 s3, s3
-; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: s_cselect_b32 s2, s3, 32
+; VI-NEXT: flat_load_ubyte v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_readfirstlane_b32 s0, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_readfirstlane_b32 s1, v3
+; VI-NEXT: s_lshl_b32 s0, s0, 8
+; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_lshl_b32 s1, s0, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_flbit_i32_b32 s1, s1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cselect_b32 s0, s1, 32
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -814,37 +814,37 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: s_add_u32 s4, s2, 2
+; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_add_u32 s2, s2, 1
-; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: s_add_u32 s2, s2, 2
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[6:7]
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v3, v[0:1]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v5, v[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_min_u32_e32 v2, 32, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
+; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -965,29 +965,30 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v8, s4
-; VI-NEXT: s_add_u32 s4, s2, 2
+; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: flat_load_ubyte v10, v[0:1]
; VI-NEXT: flat_load_ubyte v11, v[2:3]
; VI-NEXT: flat_load_ubyte v12, v[4:5]
; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: flat_load_ubyte v7, v[8:9]
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_add_u32 s4, s2, 1
-; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_add_u32 s2, s2, 2
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v8, v[0:1]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v10
; VI-NEXT: s_waitcnt vmcnt(6)
@@ -1001,19 +1002,18 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
; VI-NEXT: v_ffbh_u32_e32 v4, v4
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v5, v5, v8
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_min_u32_e32 v0, v0, v4
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_min_u32_e32 v0, 64, v0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v5
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
+; VI-NEXT: v_min_u32_e32 v2, v2, v4
+; VI-NEXT: v_min_u32_e32 v2, 64, v2
+; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i64_with_select:
@@ -1119,12 +1119,12 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; VI-NEXT: v_ffbh_u32_e32 v2, v0
+; VI-NEXT: flat_load_ubyte v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1259,10 +1259,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1505,11 +1505,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v2, v0
+; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1584,11 +1584,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v2, v0
+; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1661,11 +1661,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v2, v0
+; VI-NEXT: flat_load_ubyte v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1858,13 +1858,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbh_u32_e32 v3, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1942,13 +1942,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbh_u32_e32 v3, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -2026,13 +2026,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbh_u32_e32 v3, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -2111,13 +2111,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbh_u32_e32 v3, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 73fddb53d1dcc..23d5cb73e8dd4 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -92,11 +92,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbl_b32_e32 v2, v0
+; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbl_b32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -168,8 +168,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v1, v1
; VI-NEXT: v_ffbl_b32_e32 v0, v0
@@ -248,8 +248,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbl_b32_e32 v3, v3
; VI-NEXT: v_ffbl_b32_e32 v2, v2
@@ -511,13 +511,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-LABEL: s_cttz_zero_undef_i64_with_select:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b64 s2, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_ff1_i32_b64 s0, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_cttz_zero_undef_i64_with_select:
@@ -581,14 +581,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbl_b32_e32 v1, v0
-; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT: flat_load_ubyte v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbl_b32_e32 v3, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -677,17 +677,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbl_b32_e32 v1, v0
-; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT: flat_load_ubyte v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: v_ffbl_b32_e32 v3, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -778,37 +778,37 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: s_add_u32 s4, s2, 2
+; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_add_u32 s2, s2, 1
-; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: s_add_u32 s2, s2, 2
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[6:7]
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v3, v[0:1]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v5, v[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbl_b32_e32 v0, v0
-; VI-NEXT: v_min_u32_e32 v2, 32, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_ffbl_b32_e32 v2, v2
+; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -929,55 +929,55 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v8, s4
-; VI-NEXT: s_add_u32 s4, s2, 2
-; VI-NEXT: flat_load_ubyte v10, v[0:1]
-; VI-NEXT: flat_load_ubyte v11, v[2:3]
-; VI-NEXT: flat_load_ubyte v12, v[4:5]
-; VI-NEXT: flat_load_ubyte v6, v[6:7]
-; VI-NEXT: flat_load_ubyte v7, v[8:9]
+; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: v_mov_b32_e32 v11, s3
; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v10, s2
+; VI-NEXT: flat_load_ubyte v12, v[0:1]
+; VI-NEXT: flat_load_ubyte v13, v[2:3]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v5, v[6:7]
+; VI-NEXT: flat_load_ubyte v6, v[8:9]
+; VI-NEXT: flat_load_ubyte v7, v[10:11]
; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_add_u32 s2, s2, 2
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_add_u32 s4, s2, 1
-; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: flat_load_ubyte v8, v[0:1]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(7)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v10
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12
; VI-NEXT: s_waitcnt vmcnt(6)
-; VI-NEXT: v_or_b32_e32 v4, v4, v11
+; VI-NEXT: v_or_b32_e32 v3, v3, v13
; VI-NEXT: s_waitcnt vmcnt(5)
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v12
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; VI-NEXT: s_waitcnt vmcnt(4)
-; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v4, v5, v4
+; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v3, v4, v3
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
-; VI-NEXT: v_ffbl_b32_e32 v4, v4
-; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v4, v4, v7
+; VI-NEXT: v_ffbl_b32_e32 v3, v3
+; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v3
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v8
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: v_ffbl_b32_e32 v0, v0
-; VI-NEXT: v_min_u32_e32 v0, v4, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_min_u32_e32 v0, 64, v0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v4
+; VI-NEXT: v_ffbl_b32_e32 v2, v2
+; VI-NEXT: v_min_u32_e32 v2, v3, v2
+; VI-NEXT: v_min_u32_e32 v2, 64, v2
+; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_zero_undef_i64_with_select:
@@ -1091,36 +1091,36 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: s_add_u32 s4, s2, 2
+; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_add_u32 s2, s2, 1
-; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: s_add_u32 s2, s2, 2
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[6:7]
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v3, v[0:1]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v5, v[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbl_b32_e32 v2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_ffbl_b32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1213,36 +1213,36 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: s_add_u32 s4, s2, 2
+; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_add_u32 s2, s2, 1
-; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: s_add_u32 s2, s2, 2
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[6:7]
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v3, v[0:1]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v5, v[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbl_b32_e32 v2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_ffbl_b32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1338,39 +1338,39 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: s_add_u32 s4, s2, 2
+; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_add_u32 s2, s2, 1
-; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: s_add_u32 s2, s2, 2
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[6:7]
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v3, v[0:1]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v5, v[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbl_b32_e32 v0, v0
-; VI-NEXT: v_min_u32_e32 v0, 32, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_ffbl_b32_e32 v2, v2
+; VI-NEXT: v_min_u32_e32 v2, 32, v2
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1455,11 +1455,11 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbl_b32_e32 v2, v0
+; VI-NEXT: flat_load_ubyte v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbl_b32_e32 v2, v2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1541,19 +1541,19 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: flat_load_ubyte v4, v[0:1]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbl_b32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_ffbl_b32_e32 v2, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 745e047348626..167fa469945a6 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1466,10 +1466,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2
; SI-NEXT: s_mov_b32 s10, -1
-; SI-NEXT: s_mov_b32 s8, s2
-; SI-NEXT: s_mov_b32 s9, s3
-; SI-NEXT: s_mov_b32 s2, s10
-; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
; SI-NEXT: s_waitcnt vmcnt(1)
@@ -1485,15 +1487,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24
; SI-NEXT: v_or_b32_e32 v4, v4, v6
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s8, 0x4000405
+; VI-NEXT: s_mov_b32 s12, 0x4000405
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0
@@ -1515,10 +1517,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; VI-NEXT: flat_load_ubyte v4, v[0:1]
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s10, s6
+; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s8, s2
+; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6
@@ -1531,9 +1535,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
; VI-NEXT: v_or_b32_e32 v4, v5, v4
; VI-NEXT: v_or_b32_e32 v5, v7, v3
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: v_perm_b32 v4, v4, v5, s8
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0
+; VI-NEXT: v_perm_b32 v4, v4, v5, s12
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
@@ -1628,21 +1632,23 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_2_uses:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s6
-; SI-NEXT: s_mov_b32 s1, s7
-; SI-NEXT: s_mov_b32 s6, s2
-; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4
@@ -1664,29 +1670,31 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00
+; VI-NEXT: v_mov_b32_e32 v6, 9
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: v_mov_b32_e32 v6, 9
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v7, 0x900
+; VI-NEXT: s_mov_b32 s10, s6
+; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s8, s2
+; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
@@ -1696,14 +1704,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI-NEXT: v_add_u16_e32 v9, 9, v4
; VI-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_nop 0
; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_e32 v0, 0x900, v0
; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
index 6507976872410..50e4fd5de14c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
@@ -180,12 +180,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16
-; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16
-; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16
-; GCN-NEXT: ds_load_b128 v[0:3], v17
+; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v48
+; GCN-NEXT: v_dual_mov_b32 v57, s1 :: v_dual_add_nc_u32 v56, s1, v48
+; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16
+; GCN-NEXT: ds_load_b128 v[0:3], v32
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
@@ -194,66 +194,61 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
+; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:2064
+; GCN-NEXT: ds_load_b128 v[0:3], v32 offset:2048
+; GCN-NEXT: ds_load_b128 v[20:23], v32 offset:6160
+; GCN-NEXT: ds_load_b128 v[16:19], v32 offset:6144
+; GCN-NEXT: ds_load_b128 v[28:31], v32 offset:12304
+; GCN-NEXT: ds_load_b128 v[24:27], v32 offset:12288
+; GCN-NEXT: ds_load_b128 v[36:39], v32 offset:20496
+; GCN-NEXT: ds_load_b128 v[32:35], v32 offset:20480
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:16
-; GCN-NEXT: ds_store_b128 v16, v[8:11]
-; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:2064
-; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:2048
-; GCN-NEXT: v_mov_b32_e32 v16, s1
+; GCN-NEXT: s_waitcnt lgkmcnt(2)
+; GCN-NEXT: v_dual_mov_b32 v55, v31 :: v_dual_mov_b32 v54, v30
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_dual_mov_b32 v47, v39 :: v_dual_mov_b32 v46, v38
+; GCN-NEXT: v_dual_mov_b32 v45, v37 :: v_dual_mov_b32 v44, v36
+; GCN-NEXT: v_dual_mov_b32 v43, v35 :: v_dual_mov_b32 v42, v34
+; GCN-NEXT: v_dual_mov_b32 v41, v33 :: v_dual_mov_b32 v40, v32
+; GCN-NEXT: v_dual_mov_b32 v53, v29 :: v_dual_mov_b32 v52, v28
+; GCN-NEXT: v_dual_mov_b32 v51, v27 :: v_dual_mov_b32 v50, v26
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[40:47], v[32:39], v[32:39], v[40:47]
+; GCN-NEXT: v_dual_mov_b32 v39, v7 :: v_dual_mov_b32 v38, v6
+; GCN-NEXT: v_dual_mov_b32 v37, v5 :: v_dual_mov_b32 v36, v4
+; GCN-NEXT: v_dual_mov_b32 v35, v3 :: v_dual_mov_b32 v34, v2
+; GCN-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0
+; GCN-NEXT: v_dual_mov_b32 v49, v25 :: v_dual_mov_b32 v48, v24
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[0:7], v[0:7], v[32:39]
+; GCN-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v17
+; GCN-NEXT: v_dual_mov_b32 v2, v18 :: v_dual_mov_b32 v3, v19
+; GCN-NEXT: v_dual_mov_b32 v4, v20 :: v_dual_mov_b32 v5, v21
+; GCN-NEXT: v_dual_mov_b32 v6, v22 :: v_dual_mov_b32 v7, v23
+; GCN-NEXT: ds_store_b128 v56, v[12:15] offset:16
+; GCN-NEXT: ds_store_b128 v56, v[8:11]
+; GCN-NEXT: ds_store_b128 v57, v[36:39] offset:2064
+; GCN-NEXT: ds_store_b128 v57, v[32:35] offset:2048
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[0:7], v[16:23], v[16:23], v[0:7]
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:2064
-; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:2048
-; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:6160
-; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:6144
+; GCN-NEXT: ds_store_b128 v57, v[4:7] offset:4112
+; GCN-NEXT: ds_store_b128 v57, v[0:3] offset:4096
+; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[48:55], v[24:31], v[24:31], v[48:55]
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:4112
-; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:4096
-; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:12304
-; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:12288
+; GCN-NEXT: ds_store_b128 v57, v[52:55] offset:6160
+; GCN-NEXT: ds_store_b128 v57, v[48:51] offset:6144
+; GCN-NEXT: ds_store_b128 v57, v[44:47] offset:8208
+; GCN-NEXT: ds_store_b128 v57, v[40:43] offset:8192
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:6160
-; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:6144
-; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:20496
-; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:20480
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; GCN-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; GCN-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; GCN-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT: ds_store_b128 v16, v[12:15] offset:8208
-; GCN-NEXT: ds_store_b128 v16, v[8:11] offset:8192
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; GCN-NEXT: s_endpgm
;
@@ -262,12 +257,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0
+; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v48, 5, v0
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16
-; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16
-; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16
-; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17
+; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v48
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v57, s1 :: v_dual_add_nc_u32 v56, s1, v48
+; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16
+; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v32
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
@@ -276,66 +271,61 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
+; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:2064
+; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v32 offset:2048
+; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v32 offset:6160
+; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v32 offset:6144
+; EXACTCUTOFF-NEXT: ds_load_b128 v[28:31], v32 offset:12304
+; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v32 offset:12288
+; EXACTCUTOFF-NEXT: ds_load_b128 v[36:39], v32 offset:20496
+; EXACTCUTOFF-NEXT: ds_load_b128 v[32:35], v32 offset:20480
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:16
-; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11]
-; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:2064
-; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:2048
-; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(2)
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v55, v31 :: v_dual_mov_b32 v54, v30
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v39 :: v_dual_mov_b32 v46, v38
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v45, v37 :: v_dual_mov_b32 v44, v36
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v43, v35 :: v_dual_mov_b32 v42, v34
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v41, v33 :: v_dual_mov_b32 v40, v32
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v53, v29 :: v_dual_mov_b32 v52, v28
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v51, v27 :: v_dual_mov_b32 v50, v26
+; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[40:47], v[32:39], v[32:39], v[40:47]
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v39, v7 :: v_dual_mov_b32 v38, v6
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v37, v5 :: v_dual_mov_b32 v36, v4
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v35, v3 :: v_dual_mov_b32 v34, v2
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v49, v25 :: v_dual_mov_b32 v48, v24
+; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[0:7], v[0:7], v[32:39]
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v17
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v2, v18 :: v_dual_mov_b32 v3, v19
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v4, v20 :: v_dual_mov_b32 v5, v21
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v6, v22 :: v_dual_mov_b32 v7, v23
+; EXACTCUTOFF-NEXT: ds_store_b128 v56, v[12:15] offset:16
+; EXACTCUTOFF-NEXT: ds_store_b128 v56, v[8:11]
+; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[36:39] offset:2064
+; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[32:35] offset:2048
+; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[0:7], v[16:23], v[16:23], v[0:7]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:2064
-; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:2048
-; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:6160
-; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:6144
+; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[4:7] offset:4112
+; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[0:3] offset:4096
+; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[48:55], v[24:31], v[24:31], v[48:55]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:4112
-; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:4096
-; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:12304
-; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:12288
+; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[52:55] offset:6160
+; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[48:51] offset:6144
+; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[44:47] offset:8208
+; EXACTCUTOFF-NEXT: ds_store_b128 v57, v[40:43] offset:8192
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:6160
-; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:6144
-; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:20496
-; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:20480
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; EXACTCUTOFF-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT: v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[12:15] offset:8208
-; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] offset:8192
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
index 371b4f070094d..f4c21b01bf7f6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
@@ -9,265 +9,957 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0
-; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0
+; GCN-MINREG-NEXT: v_mov_b32_e32 v40, 1.0
+; GCN-MINREG-NEXT: v_mov_b32_e32 v39, 2.0
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_add_u32_e32 v4, s0, v0
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:112
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:96
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:80
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:64
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:16
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:32
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:48
+; GCN-MINREG-NEXT: v_add_u32_e32 v5, s0, v0
+; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v5 offset:112
+; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v5 offset:96
+; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v5 offset:80
+; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v5 offset:64
+; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v5
+; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v5 offset:16
+; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v5 offset:32
+; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v5 offset:48
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
-; GCN-MINREG-NEXT: v_add_u32_e32 v5, s1, v0
-; GCN-MINREG-NEXT: v_mov_b32_e32 v0, s1
-; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v40, v39, a[0:31]
+; GCN-MINREG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-MINREG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-MINREG-NEXT: ds_read_b128 v[6:9], v5 offset:8192
+; GCN-MINREG-NEXT: s_mov_b32 s14, -1
+; GCN-MINREG-NEXT: s_mov_b32 s15, 0xe00000
+; GCN-MINREG-NEXT: s_add_u32 s12, s12, s11
+; GCN-MINREG-NEXT: s_addc_u32 s13, s13, 0
+; GCN-MINREG-NEXT: ds_read_b128 v[34:37], v5 offset:8304
+; GCN-MINREG-NEXT: ds_read_b128 v[30:33], v5 offset:8288
+; GCN-MINREG-NEXT: ds_read_b128 v[26:29], v5 offset:8272
+; GCN-MINREG-NEXT: ds_read_b128 v[22:25], v5 offset:8256
+; GCN-MINREG-NEXT: ds_read_b128 v[18:21], v5 offset:8240
+; GCN-MINREG-NEXT: ds_read_b128 v[14:17], v5 offset:8224
+; GCN-MINREG-NEXT: ds_read_b128 v[10:13], v5 offset:8208
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(7)
+; GCN-MINREG-NEXT: buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v5
+; GCN-MINREG-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(6)
+; GCN-MINREG-NEXT: buffer_store_dword v37, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: ds_read_b128 v[6:9], v5 offset:24576
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(1)
+; GCN-MINREG-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v23, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v24, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v26, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v27, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v28, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v29, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v30, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v31, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v36, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: ds_read_b128 v[34:37], v5 offset:24688
+; GCN-MINREG-NEXT: ds_read_b128 v[30:33], v5 offset:24672
+; GCN-MINREG-NEXT: ds_read_b128 v[26:29], v5 offset:24656
+; GCN-MINREG-NEXT: ds_read_b128 v[22:25], v5 offset:24640
+; GCN-MINREG-NEXT: ds_read_b128 v[18:21], v5 offset:24624
+; GCN-MINREG-NEXT: ds_read_b128 v[14:17], v5 offset:24608
+; GCN-MINREG-NEXT: ds_read_b128 v[10:13], v5 offset:24592
+; GCN-MINREG-NEXT: ds_read_b128 a[60:63], v5 offset:49264
+; GCN-MINREG-NEXT: ds_read_b128 a[56:59], v5 offset:49248
+; GCN-MINREG-NEXT: ds_read_b128 a[52:55], v5 offset:49232
+; GCN-MINREG-NEXT: ds_read_b128 a[48:51], v5 offset:49216
+; GCN-MINREG-NEXT: ds_read_b128 a[44:47], v5 offset:49200
+; GCN-MINREG-NEXT: ds_read_b128 a[40:43], v5 offset:49184
+; GCN-MINREG-NEXT: ds_read_b128 a[36:39], v5 offset:49168
+; GCN-MINREG-NEXT: ds_read_b128 a[32:35], v5 offset:49152
+; GCN-MINREG-NEXT: buffer_store_dword a0, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: v_add_u32_e32 v41, s1, v0
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(14)
+; GCN-MINREG-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v63, a10 ; Reload Reuse
+; GCN-MINREG-NEXT: buffer_store_dword a1, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword a2, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword a3, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword a4, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword a5, off, s[12:15], 0 offset:148 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword a6, off, s[12:15], 0 offset:152 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword a7, off, s[12:15], 0 offset:156 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword a8, off, s[12:15], 0 offset:160 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword a9, off, s[12:15], 0 offset:164 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v62, a11 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v61, a12 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v58, a15 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v57, a16 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v56, a17 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v55, a18 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v54, a19 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v53, a20 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v52, a21 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v51, a22 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v50, a23 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v49, a24 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v48, a25 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v47, a26 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v46, a27 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v45, a28 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v44, a29 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v43, a30 ; Reload Reuse
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v42, a31 ; Reload Reuse
+; GCN-MINREG-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(8)
+; GCN-MINREG-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v23, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v24, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v26, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v27, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v28, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v29, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v30, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v31, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v37, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v40, v39, a[32:63]
+; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:57456
+; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:57440
+; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:57424
+; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:57408
+; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:57344
+; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:57360
+; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:57376
+; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:57392
+; GCN-MINREG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:144 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:148 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:152 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:156 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:160 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:164 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: v_mov_b32_e32 v38, s1
+; GCN-MINREG-NEXT: v_mov_b32_e32 v10, v63
+; GCN-MINREG-NEXT: v_mov_b32_e32 v11, v62
+; GCN-MINREG-NEXT: v_mov_b32_e32 v12, v61
+; GCN-MINREG-NEXT: v_mov_b32_e32 v13, v60
+; GCN-MINREG-NEXT: v_mov_b32_e32 v14, v59
+; GCN-MINREG-NEXT: v_mov_b32_e32 v15, v58
+; GCN-MINREG-NEXT: v_mov_b32_e32 v16, v57
+; GCN-MINREG-NEXT: v_mov_b32_e32 v17, v56
+; GCN-MINREG-NEXT: v_mov_b32_e32 v18, v55
+; GCN-MINREG-NEXT: v_mov_b32_e32 v19, v54
+; GCN-MINREG-NEXT: v_mov_b32_e32 v20, v53
+; GCN-MINREG-NEXT: v_mov_b32_e32 v21, v52
+; GCN-MINREG-NEXT: v_mov_b32_e32 v22, v51
+; GCN-MINREG-NEXT: v_mov_b32_e32 v23, v50
+; GCN-MINREG-NEXT: v_mov_b32_e32 v24, v49
+; GCN-MINREG-NEXT: v_mov_b32_e32 v25, v48
+; GCN-MINREG-NEXT: v_mov_b32_e32 v26, v47
+; GCN-MINREG-NEXT: v_mov_b32_e32 v27, v46
+; GCN-MINREG-NEXT: v_mov_b32_e32 v28, v45
+; GCN-MINREG-NEXT: v_mov_b32_e32 v29, v44
+; GCN-MINREG-NEXT: v_mov_b32_e32 v30, v43
+; GCN-MINREG-NEXT: s_waitcnt vmcnt(0)
+; GCN-MINREG-NEXT: v_mov_b32_e32 v31, v42
+; GCN-MINREG-NEXT: ds_write_b128 v41, v[28:31] offset:112
+; GCN-MINREG-NEXT: ds_write_b128 v41, v[24:27] offset:96
+; GCN-MINREG-NEXT: ds_write_b128 v41, v[20:23] offset:80
+; GCN-MINREG-NEXT: ds_write_b128 v41, v[16:19] offset:64
+; GCN-MINREG-NEXT: ds_write_b128 v41, v[12:15] offset:48
+; GCN-MINREG-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT: buffer_store_dword v36, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[16:19] offset:64
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[12:15] offset:48
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[8:11] offset:32
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[4:7] offset:16
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[0:3]
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:8304
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:8288
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:8272
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:8256
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:8240
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:8224
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:8208
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:8192
-; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 2
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:8256
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:8272
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:8224
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:8240
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:8192
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:8208
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:24688
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:24672
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:24656
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:24640
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:24624
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:24608
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:24592
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:24576
-; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 2
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:16448
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:16464
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:16416
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:16432
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:16384
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:16400
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:49264
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:49248
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:49232
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:49216
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:49200
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:49184
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:49168
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:49152
-; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-MINREG-NEXT: ds_write_b128 v41, v[0:3]
+; GCN-MINREG-NEXT: ds_write_b128 v41, v[4:7] offset:16
+; GCN-MINREG-NEXT: ds_write_b128 v41, v[8:11] offset:32
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[56:59] offset:24672
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[60:63] offset:24688
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[48:51] offset:24640
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[52:55] offset:24656
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[40:43] offset:24608
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[44:47] offset:24624
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[32:35] offset:24576
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[36:39] offset:24592
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(14)
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v40, v39, a[0:31]
+; GCN-MINREG-NEXT: buffer_load_dword a32, off, s[12:15], 0 offset:256 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a33, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a34, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a35, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a36, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a37, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a38, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a39, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a40, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a41, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a42, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a43, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a44, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a45, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a46, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a47, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a48, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a49, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a50, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[24:27] offset:32864
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[28:31] offset:32880
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[16:19] offset:32832
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[20:23] offset:32848
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[8:11] offset:32800
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[12:15] offset:32816
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[0:3] offset:32768
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[4:7] offset:32784
+; GCN-MINREG-NEXT: buffer_load_dword a0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a4, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a5, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a6, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a7, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a8, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a9, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a10, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a11, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a12, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a13, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a14, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a15, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a16, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a17, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a18, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a19, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a20, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a21, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a22, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a23, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a24, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a25, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a26, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a27, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a28, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a29, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a30, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a31, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a51, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a52, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a53, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a54, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a55, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a56, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a57, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a58, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a59, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a60, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a61, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a62, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: buffer_load_dword a63, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT: s_waitcnt vmcnt(13)
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v40, v39, a[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 2
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:24672
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:24640
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:24656
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:24608
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:24624
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:24576
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:24592
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:57456
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:57440
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:57424
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:57408
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:57344
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:57360
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:57376
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:57392
-; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 2
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:32832
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:32848
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:32800
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:32816
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:32768
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:32784
-; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v33, a31
+; GCN-MINREG-NEXT: s_waitcnt vmcnt(0)
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v40, v39, a[32:63]
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v32, a30
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v31, a29
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v30, a28
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v29, a27
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v28, a26
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v27, a25
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v26, a24
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v25, a23
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v24, a22
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v23, a21
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v22, a20
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v21, a19
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v20, a18
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v19, a17
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v18, a16
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v17, a15
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v16, a14
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v15, a13
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v14, a12
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v13, a11
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v12, a10
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v11, a9
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v10, a8
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v9, a7
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v8, a6
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v7, a5
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v6, a4
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v5, a3
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v4, a2
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v3, a1
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v2, a0
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[56:59] offset:16480
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[60:63] offset:16496
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[48:51] offset:16448
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[52:55] offset:16464
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[40:43] offset:16416
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[44:47] offset:16432
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[32:35] offset:16384
+; GCN-MINREG-NEXT: ds_write_b128 v38, a[36:39] offset:16400
+; GCN-MINREG-NEXT: ds_write_b128 v38, v[26:29] offset:8288
+; GCN-MINREG-NEXT: ds_write_b128 v38, v[30:33] offset:8304
+; GCN-MINREG-NEXT: ds_write_b128 v38, v[18:21] offset:8256
+; GCN-MINREG-NEXT: ds_write_b128 v38, v[22:25] offset:8272
+; GCN-MINREG-NEXT: ds_write_b128 v38, v[10:13] offset:8224
+; GCN-MINREG-NEXT: ds_write_b128 v38, v[14:17] offset:8240
+; GCN-MINREG-NEXT: ds_write_b128 v38, v[2:5] offset:8192
+; GCN-MINREG-NEXT: ds_write_b128 v38, v[6:9] offset:8208
; GCN-MINREG-NEXT: s_endpgm
;
; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
; GCN-MAXOCC: ; %bb.0: ; %entry
; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v1, 7, v0
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0
+; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GCN-MAXOCC-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-MAXOCC-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v1
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s0, v0
+; GCN-MAXOCC-NEXT: ds_read_b128 v[2:5], v1
+; GCN-MAXOCC-NEXT: ds_read_b128 v[30:33], v1 offset:112
+; GCN-MAXOCC-NEXT: ds_read_b128 v[26:29], v1 offset:96
+; GCN-MAXOCC-NEXT: ds_read_b128 v[22:25], v1 offset:80
+; GCN-MAXOCC-NEXT: ds_read_b128 v[18:21], v1 offset:64
+; GCN-MAXOCC-NEXT: ds_read_b128 v[6:9], v1 offset:16
+; GCN-MAXOCC-NEXT: ds_read_b128 v[10:13], v1 offset:32
+; GCN-MAXOCC-NEXT: ds_read_b128 v[14:17], v1 offset:48
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a0, v2
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a1, v3
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a2, v4
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a3, v5
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a4, v6
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a5, v7
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a6, v8
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a7, v9
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a8, v10
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a9, v11
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a10, v12
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a11, v13
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a12, v14
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a13, v15
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a14, v16
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a15, v17
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a16, v18
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a17, v19
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a18, v20
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a19, v21
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a20, v22
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a21, v23
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a22, v24
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a23, v25
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a24, v26
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a25, v27
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a26, v28
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a27, v29
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a28, v30
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a29, v31
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a30, v32
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a31, v33
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0
+; GCN-MAXOCC-NEXT: s_mov_b32 s14, -1
+; GCN-MAXOCC-NEXT: s_mov_b32 s15, 0xe00000
; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1
+; GCN-MAXOCC-NEXT: s_add_u32 s12, s12, s11
+; GCN-MAXOCC-NEXT: s_addc_u32 s13, s13, 0
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v4, v3
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, v2
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v2, 0x6000, v1
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:112
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:96
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:80
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:64
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:48
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3]
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192
+; GCN-MAXOCC-NEXT: s_nop 5
+; GCN-MAXOCC-NEXT: buffer_store_dword a0, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v63, a4 ; Reload Reuse
+; GCN-MAXOCC-NEXT: buffer_store_dword a1, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a2, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a3, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v62, a5 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v61, a6 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v60, a7 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v59, a8 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v58, a9 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v55, a12 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v54, a13 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v53, a14 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v52, a15 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v51, a16 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v50, a17 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v49, a18 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v48, a19 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v47, a20 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v46, a21 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v45, a22 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v44, a23 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v43, a24 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v42, a25 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v41, a26 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v40, a27 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v39, a28 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v38, a29 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v37, a30 ; Reload Reuse
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v36, a31 ; Reload Reuse
+; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v1 offset:8304
+; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v1 offset:8288
+; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v1 offset:8272
+; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v1 offset:8256
+; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v1 offset:8240
+; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v1 offset:8224
+; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v1 offset:8208
+; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v1 offset:8192
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, s1
-; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
-; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v4, a[0:31]
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:8288
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:8304
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:8256
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:8272
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:8224
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:8240
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:8192
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:8208
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576
+; GCN-MAXOCC-NEXT: s_nop 2
+; GCN-MAXOCC-NEXT: buffer_store_dword a0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: s_nop 0
+; GCN-MAXOCC-NEXT: buffer_store_dword a1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a8, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a9, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a10, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a11, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a12, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a13, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a14, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a15, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a16, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a17, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a18, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a19, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a20, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a21, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a22, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a23, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a24, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a25, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a26, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a27, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a28, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a29, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a30, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword a31, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v1 offset:24688
+; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v1 offset:24672
+; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v1 offset:24656
+; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v1 offset:24640
+; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v1 offset:24624
+; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v1 offset:24608
+; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v1 offset:24592
+; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v1 offset:24576
+; GCN-MAXOCC-NEXT: ds_read_b128 a[60:63], v1 offset:49264
+; GCN-MAXOCC-NEXT: ds_read_b128 a[56:59], v1 offset:49248
+; GCN-MAXOCC-NEXT: ds_read_b128 a[52:55], v1 offset:49232
+; GCN-MAXOCC-NEXT: ds_read_b128 a[48:51], v1 offset:49216
+; GCN-MAXOCC-NEXT: ds_read_b128 a[44:47], v1 offset:49200
+; GCN-MAXOCC-NEXT: ds_read_b128 a[40:43], v1 offset:49184
+; GCN-MAXOCC-NEXT: ds_read_b128 a[36:39], v1 offset:49168
+; GCN-MAXOCC-NEXT: ds_read_b128 a[32:35], v1 offset:49152
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v2 offset:57344
+; GCN-MAXOCC-NEXT: ds_read_b128 v[32:35], v2 offset:57456
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v2 offset:57440
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v2 offset:57424
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v2 offset:57408
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v2 offset:57360
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v2 offset:57376
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v2 offset:57392
+; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(7)
+; GCN-MAXOCC-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: s_nop 0
+; GCN-MAXOCC-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(2)
+; GCN-MAXOCC-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(1)
+; GCN-MAXOCC-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-MAXOCC-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v23, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v24, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v26, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v27, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v28, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v29, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v30, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v31, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v32, s1, v0
+; GCN-MAXOCC-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v4, v63
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v5, v62
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v6, v61
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v7, v60
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v8, v59
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v9, v58
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v10, v57
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v11, v56
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v12, v55
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v13, v54
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v14, v53
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v15, v52
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v16, v51
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v17, v50
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v18, v49
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v19, v48
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v20, v47
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v21, v46
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v22, v45
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v23, v44
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v24, v43
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v25, v42
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v26, v41
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v27, v40
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v28, v39
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v29, v38
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v30, v37
+; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(0)
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v31, v36
+; GCN-MAXOCC-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[28:31] offset:112
+; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[24:27] offset:96
+; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[20:23] offset:80
+; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[16:19] offset:64
+; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[12:15] offset:48
+; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[8:11] offset:32
+; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[4:7] offset:16
+; GCN-MAXOCC-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT: ds_write_b128 v32, v[0:3]
+; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v0, s1
+; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0
+; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[26:29] offset:8288
+; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[30:33] offset:8304
+; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[18:21] offset:8256
+; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[22:25] offset:8272
+; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[10:13] offset:8224
+; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[14:17] offset:8240
+; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(28)
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[2:5] offset:8192
+; GCN-MAXOCC-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(0)
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0
+; GCN-MAXOCC-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-MAXOCC-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[24:27] offset:16480
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[28:31] offset:16496
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[16:19] offset:16448
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[20:23] offset:16464
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[8:11] offset:16416
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[12:15] offset:16432
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[0:3] offset:16384
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[4:7] offset:16400
+; GCN-MAXOCC-NEXT: buffer_load_dword a0, off, s[12:15], 0 offset:256 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a1, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a2, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a3, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a4, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a5, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a6, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a7, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a8, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a9, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a10, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a11, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a12, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a13, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a14, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a15, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a16, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a17, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a18, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a19, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a20, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a21, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a22, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a23, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a24, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a25, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a26, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a27, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a28, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a29, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a30, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword a31, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT: s_waitcnt vmcnt(10)
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-MAXOCC-NEXT: s_nop 7
+; GCN-MAXOCC-NEXT: s_nop 7
+; GCN-MAXOCC-NEXT: s_nop 2
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[24:27] offset:32864
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[28:31] offset:32880
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[16:19] offset:32832
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[20:23] offset:32848
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[8:11] offset:32800
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[12:15] offset:32816
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[0:3] offset:32768
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[4:7] offset:32784
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 2
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:16480
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:16496
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:16448
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:16464
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:16416
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:16432
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:16384
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16400
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152
-; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[56:59] offset:24672
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[60:63] offset:24688
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[48:51] offset:24640
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[52:55] offset:24656
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[40:43] offset:24608
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[44:47] offset:24624
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[32:35] offset:24576
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, a[36:39] offset:24592
+; GCN-MAXOCC-NEXT: ds_write_b128 v0, v[6:9] offset:8208
+; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:24672
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:24688
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:24640
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:24656
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:24608
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:24624
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:24576
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:24592
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392
-; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 2
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:32864
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:32880
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:32832
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:32848
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32800
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:32816
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:32768
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:32784
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: s_endpgm
;
@@ -275,11 +967,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-ILP: ; %bb.0: ; %entry
; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; GCN-ILP-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-ILP-NEXT: v_mov_b32_e32 v2, 2.0
+; GCN-ILP-NEXT: v_lshlrev_b32_e32 v2, 7, v0
+; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0
+; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v0
+; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v2
; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48
; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32
; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16
@@ -289,119 +981,355 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96
; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v0
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-ILP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v3 offset:8192
+; GCN-ILP-NEXT: s_mov_b32 s14, -1
+; GCN-ILP-NEXT: s_mov_b32 s15, 0xe00000
+; GCN-ILP-NEXT: s_add_u32 s12, s12, s11
+; GCN-ILP-NEXT: s_addc_u32 s13, s13, 0
+; GCN-ILP-NEXT: ds_read_b128 v[32:35], v3 offset:8304
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v3 offset:8288
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v3 offset:8272
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v3 offset:8256
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v3 offset:8240
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v3 offset:8224
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v3 offset:8208
+; GCN-ILP-NEXT: s_waitcnt lgkmcnt(7)
+; GCN-ILP-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 1
-; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:112
-; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:96
-; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:80
-; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:64
-; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:48
-; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32
-; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16
-; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3]
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304
-; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 1
-; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:8288
-; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:8304
-; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:8256
-; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:8272
-; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:8224
-; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:8240
-; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:8192
-; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:8208
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688
-; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 2
-; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16400
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168
-; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:16384
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152
-; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:16432
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200
-; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:16416
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184
-; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:16464
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232
-; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:16448
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216
-; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:16496
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264
-; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:16480
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248
+; GCN-ILP-NEXT: buffer_store_dword a0, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v63, a4 ; Reload Reuse
+; GCN-ILP-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-ILP-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v23, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v24, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v26, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v27, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v28, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v29, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v30, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v31, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: ds_read_b128 v[32:35], v3 offset:24688
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v3 offset:24672
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v3 offset:24656
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v3 offset:24640
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v3 offset:24624
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v3 offset:24608
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v3 offset:24592
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v3 offset:24576
+; GCN-ILP-NEXT: ds_read_b128 a[60:63], v3 offset:49264
+; GCN-ILP-NEXT: ds_read_b128 a[56:59], v3 offset:49248
+; GCN-ILP-NEXT: ds_read_b128 a[52:55], v3 offset:49232
+; GCN-ILP-NEXT: ds_read_b128 a[48:51], v3 offset:49216
+; GCN-ILP-NEXT: ds_read_b128 a[44:47], v3 offset:49200
+; GCN-ILP-NEXT: ds_read_b128 a[40:43], v3 offset:49184
+; GCN-ILP-NEXT: ds_read_b128 a[36:39], v3 offset:49168
+; GCN-ILP-NEXT: ds_read_b128 a[32:35], v3 offset:49152
; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 1
-; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:24592
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360
-; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:24576
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344
-; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:24624
+; GCN-ILP-NEXT: buffer_store_dword a1, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword a2, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword a3, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v62, a5 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v61, a6 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v60, a7 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v59, a8 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v58, a9 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v55, a12 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v54, a13 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v53, a14 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v52, a15 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v51, a16 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v50, a17 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v49, a18 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v48, a19 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v47, a20 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v46, a21 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v45, a22 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v44, a23 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v43, a24 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v42, a25 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v41, a26 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v40, a27 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v39, a28 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v38, a29 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v37, a30 ; Reload Reuse
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v36, a31 ; Reload Reuse
; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392
-; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:24608
; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376
-; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:24656
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424
-; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:24640
+; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360
+; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344
; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408
-; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:24688
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456
-; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:24672
+; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424
; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440
+; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v2
+; GCN-ILP-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: s_nop 0
+; GCN-ILP-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1
+; GCN-ILP-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v13, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v23, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v24, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v26, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v27, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v28, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v29, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v30, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v31, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:388 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: v_mov_b32_e32 v4, v63
+; GCN-ILP-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:144 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: v_mov_b32_e32 v5, v62
+; GCN-ILP-NEXT: buffer_load_dword v32, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: v_mov_b32_e32 v6, v61
+; GCN-ILP-NEXT: v_mov_b32_e32 v7, v60
+; GCN-ILP-NEXT: v_mov_b32_e32 v8, v59
+; GCN-ILP-NEXT: v_mov_b32_e32 v9, v58
+; GCN-ILP-NEXT: v_mov_b32_e32 v10, v57
+; GCN-ILP-NEXT: v_mov_b32_e32 v11, v56
+; GCN-ILP-NEXT: v_mov_b32_e32 v12, v55
+; GCN-ILP-NEXT: v_mov_b32_e32 v13, v54
+; GCN-ILP-NEXT: v_mov_b32_e32 v14, v53
+; GCN-ILP-NEXT: v_mov_b32_e32 v15, v52
+; GCN-ILP-NEXT: v_mov_b32_e32 v16, v51
+; GCN-ILP-NEXT: v_mov_b32_e32 v17, v50
+; GCN-ILP-NEXT: v_mov_b32_e32 v18, v49
+; GCN-ILP-NEXT: v_mov_b32_e32 v19, v48
+; GCN-ILP-NEXT: v_mov_b32_e32 v20, v47
+; GCN-ILP-NEXT: v_mov_b32_e32 v21, v46
+; GCN-ILP-NEXT: v_mov_b32_e32 v22, v45
+; GCN-ILP-NEXT: v_mov_b32_e32 v23, v44
+; GCN-ILP-NEXT: v_mov_b32_e32 v24, v43
+; GCN-ILP-NEXT: v_mov_b32_e32 v25, v42
+; GCN-ILP-NEXT: v_mov_b32_e32 v26, v41
+; GCN-ILP-NEXT: v_mov_b32_e32 v27, v40
+; GCN-ILP-NEXT: v_mov_b32_e32 v28, v39
+; GCN-ILP-NEXT: v_mov_b32_e32 v29, v38
+; GCN-ILP-NEXT: v_mov_b32_e32 v30, v37
+; GCN-ILP-NEXT: s_waitcnt vmcnt(1)
+; GCN-ILP-NEXT: v_mov_b32_e32 v31, v36
+; GCN-ILP-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:384 ; 4-byte Folded Spill
+; GCN-ILP-NEXT: s_waitcnt vmcnt(2)
+; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3]
+; GCN-ILP-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0
+; GCN-ILP-NEXT: s_waitcnt vmcnt(1)
+; GCN-ILP-NEXT: ds_write_b128 v0, v[4:7] offset:16
+; GCN-ILP-NEXT: ds_write_b128 v0, v[8:11] offset:32
+; GCN-ILP-NEXT: ds_write_b128 v0, v[12:15] offset:48
+; GCN-ILP-NEXT: ds_write_b128 v0, v[16:19] offset:64
+; GCN-ILP-NEXT: ds_write_b128 v0, v[20:23] offset:80
+; GCN-ILP-NEXT: ds_write_b128 v0, v[24:27] offset:96
+; GCN-ILP-NEXT: ds_write_b128 v0, v[28:31] offset:112
+; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0
+; GCN-ILP-NEXT: s_nop 1
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
+; GCN-ILP-NEXT: s_waitcnt vmcnt(0)
+; GCN-ILP-NEXT: s_nop 7
+; GCN-ILP-NEXT: s_nop 7
+; GCN-ILP-NEXT: s_nop 1
+; GCN-ILP-NEXT: ds_write_b128 v2, a[56:59] offset:24672
+; GCN-ILP-NEXT: ds_write_b128 v2, a[60:63] offset:24688
+; GCN-ILP-NEXT: ds_write_b128 v2, a[48:51] offset:24640
+; GCN-ILP-NEXT: ds_write_b128 v2, a[52:55] offset:24656
+; GCN-ILP-NEXT: ds_write_b128 v2, a[40:43] offset:24608
+; GCN-ILP-NEXT: ds_write_b128 v2, a[44:47] offset:24624
+; GCN-ILP-NEXT: ds_write_b128 v2, a[32:35] offset:24576
+; GCN-ILP-NEXT: ds_write_b128 v2, a[36:39] offset:24592
+; GCN-ILP-NEXT: buffer_load_dword a32, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a33, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a34, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a35, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a36, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a37, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a38, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a39, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a40, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a41, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a42, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a43, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a44, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a45, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a46, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a47, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a48, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a49, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a50, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a51, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a52, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a53, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a54, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a55, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a56, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a57, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a58, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a59, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a60, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a61, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a62, off, s[12:15], 0 offset:384 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a63, off, s[12:15], 0 offset:388 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: s_waitcnt vmcnt(0)
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
+; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:32864
+; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:32880
+; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:32832
+; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:32848
+; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32800
+; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:32816
+; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:32768
+; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:32784
+; GCN-ILP-NEXT: buffer_load_dword a0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a4, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a5, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a6, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a7, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a8, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a9, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a10, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a11, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a12, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a13, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a14, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a15, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a16, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a17, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a18, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a19, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a20, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a21, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a22, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a23, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a24, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a25, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a26, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a27, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a28, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a29, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a30, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: buffer_load_dword a31, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload
+; GCN-ILP-NEXT: s_waitcnt vmcnt(0)
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 7
; GCN-ILP-NEXT: s_nop 2
-; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:32864
-; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:32880
-; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:32832
-; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:32848
-; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32800
-; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:32816
-; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:32768
-; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:32784
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v35, a31
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v34, a30
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v33, a29
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v32, a28
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v31, a27
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v30, a26
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v29, a25
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v28, a24
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v27, a23
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v26, a22
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v25, a21
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v24, a20
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v23, a19
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v22, a18
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v21, a17
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v20, a16
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v19, a15
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v18, a14
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v17, a13
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v16, a12
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v15, a11
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v14, a10
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v13, a9
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v12, a8
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v11, a7
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v10, a6
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v9, a5
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v8, a4
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v7, a3
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v6, a2
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v5, a1
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v4, a0
+; GCN-ILP-NEXT: ds_write_b128 v2, v[28:31] offset:8288
+; GCN-ILP-NEXT: ds_write_b128 v2, v[32:35] offset:8304
+; GCN-ILP-NEXT: ds_write_b128 v2, v[20:23] offset:8256
+; GCN-ILP-NEXT: ds_write_b128 v2, v[24:27] offset:8272
+; GCN-ILP-NEXT: ds_write_b128 v2, v[12:15] offset:8224
+; GCN-ILP-NEXT: ds_write_b128 v2, v[16:19] offset:8240
+; GCN-ILP-NEXT: ds_write_b128 v2, v[4:7] offset:8192
+; GCN-ILP-NEXT: ds_write_b128 v2, v[8:11] offset:8208
+; GCN-ILP-NEXT: ds_write_b128 v2, a[56:59] offset:16480
+; GCN-ILP-NEXT: ds_write_b128 v2, a[60:63] offset:16496
+; GCN-ILP-NEXT: ds_write_b128 v2, a[48:51] offset:16448
+; GCN-ILP-NEXT: ds_write_b128 v2, a[52:55] offset:16464
+; GCN-ILP-NEXT: ds_write_b128 v2, a[40:43] offset:16416
+; GCN-ILP-NEXT: ds_write_b128 v2, a[44:47] offset:16432
+; GCN-ILP-NEXT: ds_write_b128 v2, a[32:35] offset:16384
+; GCN-ILP-NEXT: ds_write_b128 v2, a[36:39] offset:16400
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: s_endpgm
entry:
@@ -485,12 +1413,20 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:48
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GCN-MINREG-NEXT: ds_read_b128 a[60:63], v3 offset:8304
+; GCN-MINREG-NEXT: ds_read_b128 a[56:59], v3 offset:8288
+; GCN-MINREG-NEXT: ds_read_b128 a[52:55], v3 offset:8272
+; GCN-MINREG-NEXT: ds_read_b128 a[48:51], v3 offset:8256
+; GCN-MINREG-NEXT: ds_read_b128 a[44:47], v3 offset:8240
+; GCN-MINREG-NEXT: ds_read_b128 a[40:43], v3 offset:8224
+; GCN-MINREG-NEXT: ds_read_b128 a[36:39], v3 offset:8208
+; GCN-MINREG-NEXT: ds_read_b128 a[32:35], v3 offset:8192
; GCN-MINREG-NEXT: v_add_u32_e32 v2, s1, v2
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v0, a[32:63]
; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 1
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:112
; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:96
; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:80
@@ -499,31 +1435,19 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32
; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16
; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3]
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:8304
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:8288
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:8272
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:8256
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:8240
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:8224
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:8208
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:8192
-; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GCN-MINREG-NEXT: v_mov_b32_e32 v2, s1
+; GCN-MINREG-NEXT: s_nop 1
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[56:59] offset:8288
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[60:63] offset:8304
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[48:51] offset:8256
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[52:55] offset:8272
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[40:43] offset:8224
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[44:47] offset:8240
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[32:35] offset:8192
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[36:39] offset:8208
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 1
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:8304
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:8256
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:8272
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:8224
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:8240
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:8192
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:8208
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_barrier mask(0x00000000)
; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:24688
@@ -536,44 +1460,54 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:24624
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GCN-MINREG-NEXT: ds_read_b128 a[60:63], v3 offset:49264
+; GCN-MINREG-NEXT: ds_read_b128 a[56:59], v3 offset:49248
+; GCN-MINREG-NEXT: ds_read_b128 a[52:55], v3 offset:49232
+; GCN-MINREG-NEXT: ds_read_b128 a[48:51], v3 offset:49216
+; GCN-MINREG-NEXT: ds_read_b128 a[44:47], v3 offset:49200
+; GCN-MINREG-NEXT: ds_read_b128 a[40:43], v3 offset:49184
+; GCN-MINREG-NEXT: ds_read_b128 a[36:39], v3 offset:49168
+; GCN-MINREG-NEXT: ds_read_b128 a[32:35], v3 offset:49152
; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 1
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:16496
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:16480
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:16464
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:16448
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:16432
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:16416
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16400
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:16384
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:49264
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:49248
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:49232
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:49216
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:49200
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:49184
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:49168
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:49152
-; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v0, a[32:63]
; GCN-MINREG-NEXT: s_nop 7
-; GCN-MINREG-NEXT: s_nop 2
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:24688
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:24672
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:24656
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:24640
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:24624
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:24608
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:24592
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:24576
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v37, a31
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v36, a30
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v35, a29
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v34, a28
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v33, a27
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v32, a26
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v31, a25
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v30, a24
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v29, a23
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v28, a22
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v27, a21
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v26, a20
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v25, a19
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v24, a18
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v23, a17
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v22, a16
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v21, a15
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v20, a14
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v19, a13
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v18, a12
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v17, a11
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v16, a10
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v15, a9
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v14, a8
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v13, a7
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v12, a6
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v11, a5
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v10, a4
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v9, a3
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v8, a2
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v7, a1
+; GCN-MINREG-NEXT: v_accvgpr_read_b32 v6, a0
; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:57456
; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:57440
; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:57424
@@ -582,12 +1516,28 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:57360
; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:57376
; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:57392
-; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[60:63] offset:24688
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[56:59] offset:24672
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[52:55] offset:24656
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[48:51] offset:24640
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[44:47] offset:24624
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[40:43] offset:24608
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[36:39] offset:24592
+; GCN-MINREG-NEXT: ds_write_b128 v2, a[32:35] offset:24576
+; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(8)
; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT: s_nop 7
+; GCN-MINREG-NEXT: ds_write_b128 v2, v[34:37] offset:16496
+; GCN-MINREG-NEXT: ds_write_b128 v2, v[30:33] offset:16480
+; GCN-MINREG-NEXT: ds_write_b128 v2, v[26:29] offset:16464
+; GCN-MINREG-NEXT: ds_write_b128 v2, v[22:25] offset:16448
+; GCN-MINREG-NEXT: ds_write_b128 v2, v[18:21] offset:16432
+; GCN-MINREG-NEXT: ds_write_b128 v2, v[14:17] offset:16416
+; GCN-MINREG-NEXT: ds_write_b128 v2, v[10:13] offset:16400
+; GCN-MINREG-NEXT: ds_write_b128 v2, v[6:9] offset:16384
; GCN-MINREG-NEXT: s_nop 7
; GCN-MINREG-NEXT: s_nop 2
; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:32880
@@ -605,134 +1555,210 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MAXOCC: ; %bb.0: ; %entry
; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-MAXOCC-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v3, 7, v0
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0
+; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v2, 7, v0
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v0, 1.0
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v3
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s0, v2
+; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v3 offset:112
+; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v3 offset:96
+; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v3 offset:80
+; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v3 offset:64
+; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v3
+; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v3 offset:16
+; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v3 offset:32
+; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v3 offset:48
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s1, v3
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-MAXOCC-NEXT: ds_read_b128 a[60:63], v3 offset:8304
+; GCN-MAXOCC-NEXT: ds_read_b128 a[56:59], v3 offset:8288
+; GCN-MAXOCC-NEXT: ds_read_b128 a[52:55], v3 offset:8272
+; GCN-MAXOCC-NEXT: ds_read_b128 a[48:51], v3 offset:8256
+; GCN-MAXOCC-NEXT: ds_read_b128 a[44:47], v3 offset:8240
+; GCN-MAXOCC-NEXT: ds_read_b128 a[40:43], v3 offset:8224
+; GCN-MAXOCC-NEXT: ds_read_b128 a[36:39], v3 offset:8208
+; GCN-MAXOCC-NEXT: ds_read_b128 a[32:35], v3 offset:8192
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v2, s1, v2
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 7
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[28:31] offset:112
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[24:27] offset:96
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[20:23] offset:80
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[16:19] offset:64
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[12:15] offset:48
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[8:11] offset:32
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[4:7] offset:16
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[0:3]
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, s1
; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:112
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:96
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:80
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:64
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:48
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3]
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192
-; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, s1
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[56:59] offset:8288
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[60:63] offset:8304
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[48:51] offset:8256
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[52:55] offset:8272
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[40:43] offset:8224
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[44:47] offset:8240
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[32:35] offset:8192
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[36:39] offset:8208
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:8288
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:8304
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:8256
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:8272
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:8224
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:8240
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:8192
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:8208
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_barrier mask(0x00000000)
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v3 offset:24576
+; GCN-MAXOCC-NEXT: ds_read_b128 v[32:35], v3 offset:24688
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v3 offset:24672
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v3 offset:24656
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v3 offset:24640
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v3 offset:24592
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v3 offset:24608
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v3 offset:24624
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a0, v4
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a1, v5
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a2, v6
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a3, v7
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a4, v8
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a5, v9
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a6, v10
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a7, v11
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a8, v12
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a9, v13
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a10, v14
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a11, v15
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a12, v16
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a13, v17
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a14, v18
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a15, v19
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a16, v20
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a17, v21
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a18, v22
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a19, v23
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a20, v24
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a21, v25
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a22, v26
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a23, v27
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a24, v28
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a25, v29
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a26, v30
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a27, v31
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a28, v32
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a29, v33
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a30, v34
+; GCN-MAXOCC-NEXT: v_accvgpr_write_b32 a31, v35
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v4, 0x6000, v3
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MAXOCC-NEXT: s_nop 0
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-MAXOCC-NEXT: ds_read_b128 a[60:63], v3 offset:49264
+; GCN-MAXOCC-NEXT: ds_read_b128 a[56:59], v3 offset:49248
+; GCN-MAXOCC-NEXT: ds_read_b128 a[52:55], v3 offset:49232
+; GCN-MAXOCC-NEXT: ds_read_b128 a[48:51], v3 offset:49216
+; GCN-MAXOCC-NEXT: ds_read_b128 a[44:47], v3 offset:49200
+; GCN-MAXOCC-NEXT: ds_read_b128 a[40:43], v3 offset:49184
+; GCN-MAXOCC-NEXT: ds_read_b128 a[36:39], v3 offset:49168
+; GCN-MAXOCC-NEXT: ds_read_b128 a[32:35], v3 offset:49152
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 2
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:16496
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:16480
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:16464
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:16448
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:16432
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:16416
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16400
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:16384
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152
-; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v37, a31
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v35, a29
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v34, a28
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v33, a27
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v32, a26
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v31, a25
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v30, a24
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v29, a23
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v28, a22
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v27, a21
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v26, a20
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v25, a19
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v24, a18
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v23, a17
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v22, a16
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v21, a15
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v20, a14
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v19, a13
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v18, a12
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v17, a11
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v16, a10
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v15, a9
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v14, a8
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v13, a7
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v12, a6
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v11, a5
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v10, a4
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v9, a3
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v8, a2
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v7, a1
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v6, a0
+; GCN-MAXOCC-NEXT: v_accvgpr_read_b32 v36, a30
+; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v4 offset:57456
+; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v4 offset:57440
+; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v4 offset:57424
+; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v4 offset:57408
+; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v4 offset:57344
+; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v4 offset:57360
+; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v4 offset:57376
+; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v4 offset:57392
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[8:9], v[10:11], v[10:11] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[10:11], v[12:13], v[12:13] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[12:13], v[14:15], v[14:15] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[14:15], v[16:17], v[16:17] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[16:17], v[18:19], v[18:19] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[18:19], v[20:21], v[20:21] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[20:21], v[22:23], v[22:23] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[22:23], v[24:25], v[24:25] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[24:25], v[26:27], v[26:27] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[26:27], v[28:29], v[28:29] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[28:29], v[30:31], v[30:31] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[30:31], v[32:33], v[32:33] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[32:33], v[34:35], v[34:35] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: v_pk_mov_b32 v[34:35], v[36:37], v[36:37] op_sel:[0,1]
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[32:35] offset:16496
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[28:31] offset:16480
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[24:27] offset:16464
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[20:23] offset:16448
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[16:19] offset:16432
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[12:15] offset:16416
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[8:11] offset:16400
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, v[4:7] offset:16384
+; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(14)
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
-; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:24688
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:24672
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:24656
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:24640
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:24624
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:24608
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:24592
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:24576
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392
-; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-MAXOCC-NEXT: s_nop 2
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[60:63] offset:24688
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[56:59] offset:24672
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[52:55] offset:24656
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[48:51] offset:24640
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[44:47] offset:24624
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[40:43] offset:24608
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[36:39] offset:24592
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[32:35] offset:24576
+; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(14)
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 7
; GCN-MAXOCC-NEXT: s_nop 2
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:32880
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:32864
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:32848
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:32832
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:32816
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32800
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:32784
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:32768
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[28:31] offset:32880
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[24:27] offset:32864
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[20:23] offset:32848
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[16:19] offset:32832
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[12:15] offset:32816
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[8:11] offset:32800
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[4:7] offset:32784
+; GCN-MAXOCC-NEXT: ds_write_b128 v2, a[0:3] offset:32768
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: s_endpgm
;
@@ -745,47 +1771,40 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v2
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112
+; GCN-ILP-NEXT: ds_read_b128 a[44:47], v3 offset:48
+; GCN-ILP-NEXT: ds_read_b128 a[40:43], v3 offset:32
+; GCN-ILP-NEXT: ds_read_b128 a[36:39], v3 offset:16
+; GCN-ILP-NEXT: ds_read_b128 a[32:35], v3
+; GCN-ILP-NEXT: ds_read_b128 a[48:51], v3 offset:64
+; GCN-ILP-NEXT: ds_read_b128 a[52:55], v3 offset:80
+; GCN-ILP-NEXT: ds_read_b128 a[56:59], v3 offset:96
+; GCN-ILP-NEXT: ds_read_b128 a[60:63], v3 offset:112
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GCN-ILP-NEXT: v_add_u32_e32 v2, s1, v2
-; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 1
-; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3]
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192
-; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16
; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208
-; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32
; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224
-; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:48
; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240
-; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:64
; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256
-; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:80
; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272
-; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:96
; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288
-; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:112
; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304
-; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT: v_add_u32_e32 v4, s1, v2
; GCN-ILP-NEXT: v_mov_b32_e32 v2, s1
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 1
+; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT: s_nop 6
+; GCN-ILP-NEXT: ds_write_b128 v4, a[32:35]
+; GCN-ILP-NEXT: ds_write_b128 v4, a[36:39] offset:16
+; GCN-ILP-NEXT: ds_write_b128 v4, a[40:43] offset:32
+; GCN-ILP-NEXT: ds_write_b128 v4, a[44:47] offset:48
+; GCN-ILP-NEXT: ds_write_b128 v4, a[48:51] offset:64
+; GCN-ILP-NEXT: ds_write_b128 v4, a[52:55] offset:80
+; GCN-ILP-NEXT: ds_write_b128 v4, a[56:59] offset:96
+; GCN-ILP-NEXT: ds_write_b128 v4, a[60:63] offset:112
+; GCN-ILP-NEXT: s_nop 3
; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:8288
; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:8304
; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:8256
@@ -795,6 +1814,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:8192
; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:8208
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_barrier mask(0x00000000)
; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624
; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608
@@ -806,53 +1828,82 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT: ds_read_b128 a[60:63], v3 offset:49264
+; GCN-ILP-NEXT: ds_read_b128 a[56:59], v3 offset:49248
+; GCN-ILP-NEXT: ds_read_b128 a[52:55], v3 offset:49232
+; GCN-ILP-NEXT: ds_read_b128 a[48:51], v3 offset:49216
+; GCN-ILP-NEXT: ds_read_b128 a[44:47], v3 offset:49200
+; GCN-ILP-NEXT: ds_read_b128 a[40:43], v3 offset:49184
+; GCN-ILP-NEXT: ds_read_b128 a[36:39], v3 offset:49168
+; GCN-ILP-NEXT: ds_read_b128 a[32:35], v3 offset:49152
+; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 2
-; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:16496
-; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:16480
-; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:16464
-; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:16448
-; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:16432
-; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:16416
-; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16400
-; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:16384
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3
-; GCN-ILP-NEXT: s_nop 7
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
; GCN-ILP-NEXT: s_nop 7
-; GCN-ILP-NEXT: s_nop 1
-; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:24576
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344
-; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:24592
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360
-; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:24608
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376
-; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:24624
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v35, a31
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v34, a30
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v33, a29
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v32, a28
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v31, a27
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v30, a26
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v29, a25
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v28, a24
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v27, a23
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v26, a22
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v25, a21
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v24, a20
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v23, a19
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v22, a18
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v21, a17
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v20, a16
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v19, a15
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v18, a14
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v17, a13
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v16, a12
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v15, a11
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v14, a10
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v13, a9
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v12, a8
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v11, a7
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v10, a6
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v9, a5
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v8, a4
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v7, a3
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v6, a2
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v5, a1
+; GCN-ILP-NEXT: v_accvgpr_read_b32 v4, a0
; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392
-; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:24640
+; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376
+; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360
+; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344
; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408
-; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:24656
; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424
-; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:24672
; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440
-; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:24688
; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456
-; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-ILP-NEXT: ds_write_b128 v2, a[60:63] offset:24688
+; GCN-ILP-NEXT: ds_write_b128 v2, a[56:59] offset:24672
+; GCN-ILP-NEXT: ds_write_b128 v2, a[52:55] offset:24656
+; GCN-ILP-NEXT: ds_write_b128 v2, a[48:51] offset:24640
+; GCN-ILP-NEXT: ds_write_b128 v2, a[44:47] offset:24624
+; GCN-ILP-NEXT: ds_write_b128 v2, a[40:43] offset:24608
+; GCN-ILP-NEXT: ds_write_b128 v2, a[36:39] offset:24592
+; GCN-ILP-NEXT: ds_write_b128 v2, a[32:35] offset:24576
+; GCN-ILP-NEXT: ds_write_b128 v2, v[32:35] offset:16496
+; GCN-ILP-NEXT: ds_write_b128 v2, v[28:31] offset:16480
+; GCN-ILP-NEXT: ds_write_b128 v2, v[24:27] offset:16464
+; GCN-ILP-NEXT: ds_write_b128 v2, v[20:23] offset:16448
+; GCN-ILP-NEXT: ds_write_b128 v2, v[16:19] offset:16432
+; GCN-ILP-NEXT: ds_write_b128 v2, v[12:15] offset:16416
+; GCN-ILP-NEXT: ds_write_b128 v2, v[8:11] offset:16400
+; GCN-ILP-NEXT: ds_write_b128 v2, v[4:7] offset:16384
+; GCN-ILP-NEXT: s_waitcnt lgkmcnt(14)
; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: s_nop 7
@@ -867,8 +1918,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:32784
; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:32768
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
-; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 73586b1243376..266df5d56b5c2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -387,88 +387,87 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; GCN-NEXT: ; kill: killed $sgpr0_sgpr1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
+; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
+; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_lo_u32 v13, v13, v13
-; GCN-NEXT: v_mul_lo_u32 v12, v12, v12
-; GCN-NEXT: v_mul_lo_u32 v15, v15, v15
-; GCN-NEXT: v_mul_lo_u32 v14, v14, v14
-; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
-; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_lo_u32 v7, v7, v7
+; GCN-NEXT: v_mul_lo_u32 v6, v6, v6
+; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80
+; GCN-NEXT: v_mul_lo_u32 v5, v5, v5
+; GCN-NEXT: v_mul_lo_u32 v4, v4, v4
+; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:48
+; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_mul_lo_u32 v11, v11, v11
+; GCN-NEXT: v_mul_lo_u32 v10, v10, v10
+; GCN-NEXT: v_mul_lo_u32 v9, v9, v9
+; GCN-NEXT: v_mul_lo_u32 v8, v8, v8
+; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
; GCN-NEXT: v_mul_lo_u32 v3, v3, v3
; GCN-NEXT: v_mul_lo_u32 v2, v2, v2
+; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
; GCN-NEXT: v_mul_lo_u32 v1, v1, v1
; GCN-NEXT: v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
-; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_lo_u32 v3, v3, v3
-; GCN-NEXT: v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT: v_mul_lo_u32 v1, v1, v1
-; GCN-NEXT: v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_lo_u32 v3, v3, v3
-; GCN-NEXT: v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT: v_mul_lo_u32 v1, v1, v1
-; GCN-NEXT: v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
-; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_lo_u32 v3, v3, v3
-; GCN-NEXT: v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT: v_mul_lo_u32 v1, v1, v1
-; GCN-NEXT: v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80
-; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_mul_lo_u32 v15, v15, v15
+; GCN-NEXT: v_mul_lo_u32 v14, v14, v14
+; GCN-NEXT: v_mul_lo_u32 v13, v13, v13
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_mul_lo_u32 v19, v19, v19
+; GCN-NEXT: v_mul_lo_u32 v18, v18, v18
+; GCN-NEXT: v_mul_lo_u32 v17, v17, v17
+; GCN-NEXT: v_mul_lo_u32 v16, v16, v16
+; GCN-NEXT: v_mul_lo_u32 v12, v12, v12
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_mul_lo_u32 v23, v23, v23
+; GCN-NEXT: v_mul_lo_u32 v22, v22, v22
+; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80
+; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
+; GCN-NEXT: v_mul_lo_u32 v21, v21, v21
+; GCN-NEXT: v_mul_lo_u32 v20, v20, v20
+; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:48
+; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3]
; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_lo_u32 v7, v7, v7
-; GCN-NEXT: v_mul_lo_u32 v6, v6, v6
-; GCN-NEXT: v_mul_lo_u32 v5, v5, v5
-; GCN-NEXT: v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_lo_u32 v9, v9, v9
-; GCN-NEXT: v_mul_lo_u32 v8, v8, v8
-; GCN-NEXT: v_mul_lo_u32 v11, v11, v11
-; GCN-NEXT: v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mul_lo_u32 v3, v31, v31
+; GCN-NEXT: v_mul_lo_u32 v2, v30, v30
+; GCN-NEXT: v_mul_lo_u32 v1, v29, v29
+; GCN-NEXT: v_mul_lo_u32 v0, v28, v28
+; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] offset:16
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_mul_lo_u32 v1, v25, v25
+; GCN-NEXT: v_mul_lo_u32 v0, v24, v24
+; GCN-NEXT: v_mul_lo_u32 v3, v27, v27
+; GCN-NEXT: v_mul_lo_u32 v2, v26, v26
+; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] offset:32
; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_lo_u32 v11, v11, v11
-; GCN-NEXT: v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT: v_mul_lo_u32 v9, v9, v9
-; GCN-NEXT: v_mul_lo_u32 v8, v8, v8
-; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64
; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; GCN-NEXT: s_endpgm
;
@@ -476,88 +475,87 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
; EXACTCUTOFF: ; %bb.0:
; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; EXACTCUTOFF-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0
+; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
+; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
+; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14
-; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
-; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0)
+; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1)
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6
+; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:80
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4
+; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:48
+; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
+; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(4)
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8
+; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3
; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2
+; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1
; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
-; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
-; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
-; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:80
-; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:80
-; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(5)
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13
+; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(4)
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v19, v19, v19
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v18, v18, v18
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v17, v17, v17
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12
+; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(3)
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v23, v23, v23
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v22, v22, v22
+; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:80
+; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20
+; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:48
+; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:64
+; EXACTCUTOFF-NEXT: s_nop 0
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v31, v31
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v30, v30
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v29, v29
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v28, v28
+; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] offset:16
+; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(7)
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v25, v25
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v24, v24
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v27, v27
+; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v26, v26
+; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] offset:32
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9
-; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8
-; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:64
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #2
@@ -887,12 +885,44 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:8304
+; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:8288
+; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:8272
+; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:8256
+; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:8240
+; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:8224
+; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:8208
+; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:8192
+; GCN-NEXT: v_add_u32_e32 v4, 0x6000, v1
; GCN-NEXT: v_add_u32_e32 v0, s1, v0
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
+; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688
+; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672
+; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656
+; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640
+; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624
+; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608
+; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592
+; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576
+; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:49264
+; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:49248
+; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:49232
+; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:49216
+; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:49200
+; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:49184
+; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:49168
+; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:49152
+; GCN-NEXT: ds_read_b128 a[156:159], v4 offset:57456
+; GCN-NEXT: ds_read_b128 a[152:155], v4 offset:57440
+; GCN-NEXT: ds_read_b128 a[148:151], v4 offset:57424
+; GCN-NEXT: ds_read_b128 a[144:147], v4 offset:57408
+; GCN-NEXT: ds_read_b128 a[128:131], v4 offset:57344
+; GCN-NEXT: ds_read_b128 a[132:135], v4 offset:57360
+; GCN-NEXT: ds_read_b128 a[136:139], v4 offset:57376
+; GCN-NEXT: ds_read_b128 a[140:143], v4 offset:57392
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112
; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96
; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80
@@ -901,104 +931,64 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32
; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16
; GCN-NEXT: ds_write_b128 v0, a[0:3]
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288
+; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304
+; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256
+; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:8272
+; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:8224
+; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:8240
+; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:8192
+; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:8208
+; GCN-NEXT: s_waitcnt lgkmcnt(14)
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288
-; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304
-; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256
-; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272
-; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224
-; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240
-; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192
-; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:24688
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:24672
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:24656
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:24640
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:24624
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:24608
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:24592
-; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:24576
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480
+; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496
+; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448
+; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464
+; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416
+; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432
+; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384
+; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
-; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480
-; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496
-; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:16448
-; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:16464
-; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:16416
-; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:16432
-; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:16384
-; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16400
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168
-; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1
+; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:24672
+; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:24688
+; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:24640
+; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:24656
+; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:24608
+; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:24624
+; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:24576
+; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:24592
+; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672
-; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688
-; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640
-; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656
-; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608
-; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:24624
-; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576
-; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592
-; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:57456
-; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:57440
-; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:57424
-; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:57408
-; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:57344
-; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:57360
-; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:57376
-; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:57392
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:32864
+; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:32880
+; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:32832
+; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:32848
+; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32800
+; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:32816
+; GCN-NEXT: ds_write_b128 v0, a[128:131] offset:32768
+; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:32784
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: s_nop 2
-; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864
-; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880
-; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832
-; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848
-; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800
-; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816
-; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768
-; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-NEXT: s_endpgm
;
@@ -1021,12 +1011,44 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48
; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; EXACTCUTOFF-NEXT: ds_read_b128 a[60:63], v1 offset:8304
+; EXACTCUTOFF-NEXT: ds_read_b128 a[56:59], v1 offset:8288
+; EXACTCUTOFF-NEXT: ds_read_b128 a[52:55], v1 offset:8272
+; EXACTCUTOFF-NEXT: ds_read_b128 a[48:51], v1 offset:8256
+; EXACTCUTOFF-NEXT: ds_read_b128 a[44:47], v1 offset:8240
+; EXACTCUTOFF-NEXT: ds_read_b128 a[40:43], v1 offset:8224
+; EXACTCUTOFF-NEXT: ds_read_b128 a[36:39], v1 offset:8208
+; EXACTCUTOFF-NEXT: ds_read_b128 a[32:35], v1 offset:8192
+; EXACTCUTOFF-NEXT: v_add_u32_e32 v4, 0x6000, v1
; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT: s_nop 7
-; EXACTCUTOFF-NEXT: s_nop 7
-; EXACTCUTOFF-NEXT: s_nop 1
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
+; EXACTCUTOFF-NEXT: ds_read_b128 a[92:95], v1 offset:24688
+; EXACTCUTOFF-NEXT: ds_read_b128 a[88:91], v1 offset:24672
+; EXACTCUTOFF-NEXT: ds_read_b128 a[84:87], v1 offset:24656
+; EXACTCUTOFF-NEXT: ds_read_b128 a[80:83], v1 offset:24640
+; EXACTCUTOFF-NEXT: ds_read_b128 a[76:79], v1 offset:24624
+; EXACTCUTOFF-NEXT: ds_read_b128 a[72:75], v1 offset:24608
+; EXACTCUTOFF-NEXT: ds_read_b128 a[68:71], v1 offset:24592
+; EXACTCUTOFF-NEXT: ds_read_b128 a[64:67], v1 offset:24576
+; EXACTCUTOFF-NEXT: ds_read_b128 a[124:127], v1 offset:49264
+; EXACTCUTOFF-NEXT: ds_read_b128 a[120:123], v1 offset:49248
+; EXACTCUTOFF-NEXT: ds_read_b128 a[116:119], v1 offset:49232
+; EXACTCUTOFF-NEXT: ds_read_b128 a[112:115], v1 offset:49216
+; EXACTCUTOFF-NEXT: ds_read_b128 a[108:111], v1 offset:49200
+; EXACTCUTOFF-NEXT: ds_read_b128 a[104:107], v1 offset:49184
+; EXACTCUTOFF-NEXT: ds_read_b128 a[100:103], v1 offset:49168
+; EXACTCUTOFF-NEXT: ds_read_b128 a[96:99], v1 offset:49152
+; EXACTCUTOFF-NEXT: ds_read_b128 a[156:159], v4 offset:57456
+; EXACTCUTOFF-NEXT: ds_read_b128 a[152:155], v4 offset:57440
+; EXACTCUTOFF-NEXT: ds_read_b128 a[148:151], v4 offset:57424
+; EXACTCUTOFF-NEXT: ds_read_b128 a[144:147], v4 offset:57408
+; EXACTCUTOFF-NEXT: ds_read_b128 a[128:131], v4 offset:57344
+; EXACTCUTOFF-NEXT: ds_read_b128 a[132:135], v4 offset:57360
+; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v4 offset:57376
+; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v4 offset:57392
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80
@@ -1035,104 +1057,64 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16
; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3]
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:8288
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:8304
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:8256
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[52:55] offset:8272
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[40:43] offset:8224
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[44:47] offset:8240
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[32:35] offset:8192
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[36:39] offset:8208
+; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14)
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
-; EXACTCUTOFF-NEXT: s_nop 1
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:24688
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:24672
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:24656
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:24640
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:24624
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:24608
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:24592
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:24576
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; EXACTCUTOFF-NEXT: s_nop 2
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[88:91] offset:16480
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[92:95] offset:16496
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[80:83] offset:16448
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[84:87] offset:16464
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[72:75] offset:16416
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[76:79] offset:16432
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[64:67] offset:16384
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[68:71] offset:16400
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 2
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:16448
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:16464
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:16416
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:16432
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:16384
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16400
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:49264
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:49248
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:49232
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:49216
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:49200
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:49184
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:49168
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:49152
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, 0x6000, v1
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[120:123] offset:24672
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[124:127] offset:24688
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[112:115] offset:24640
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[116:119] offset:24656
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[104:107] offset:24608
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[108:111] offset:24624
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[96:99] offset:24576
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[100:103] offset:24592
+; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT: s_nop 7
; EXACTCUTOFF-NEXT: s_nop 7
-; EXACTCUTOFF-NEXT: s_nop 1
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:24640
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:24656
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:24608
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:24624
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:24576
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:24592
-; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:57456
-; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:57440
-; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:57424
-; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:57408
-; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:57344
-; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:57360
-; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:57376
-; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:57392
-; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; EXACTCUTOFF-NEXT: s_nop 2
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:32864
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:32880
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[144:147] offset:32832
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:32848
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[136:139] offset:32800
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[140:143] offset:32816
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[128:131] offset:32768
+; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[132:135] offset:32784
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT: s_nop 7
-; EXACTCUTOFF-NEXT: s_nop 7
-; EXACTCUTOFF-NEXT: s_nop 2
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768
-; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784
; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; EXACTCUTOFF-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
index 190384255bf23..efece9d02950d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
@@ -1119,21 +1119,44 @@ define amdgpu_kernel void @kern_align32_global_ptr(ptr addrspace(1) align 1024 %
}
define amdgpu_kernel void @kern_noalias_global_ptr(ptr addrspace(1) noalias %ptr) #0 {
-; GCN-LABEL: @kern_noalias_global_ptr(
-; GCN-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; GCN-NEXT: store volatile ptr addrspace(1) [[PTR:%.*]], ptr addrspace(1) poison, align 8
-; GCN-NEXT: ret void
+; HSA-LABEL: @kern_noalias_global_ptr(
+; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; HSA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META5:![0-9]+]]
+; HSA-NEXT: ret void
+;
+; MESA-LABEL: @kern_noalias_global_ptr(
+; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; MESA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META5:![0-9]+]]
+; MESA-NEXT: ret void
;
store volatile ptr addrspace(1) %ptr, ptr addrspace(1) poison
ret void
}
define amdgpu_kernel void @kern_noalias_global_ptr_x2(ptr addrspace(1) noalias %ptr0, ptr addrspace(1) noalias %ptr1) #0 {
-; GCN-LABEL: @kern_noalias_global_ptr_x2(
-; GCN-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; GCN-NEXT: store volatile ptr addrspace(1) [[PTR0:%.*]], ptr addrspace(1) poison, align 8
-; GCN-NEXT: store volatile ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(1) poison, align 8
-; GCN-NEXT: ret void
+; HSA-LABEL: @kern_noalias_global_ptr_x2(
+; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT: [[PTR0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[PTR0_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR0_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; HSA-NEXT: [[PTR1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 8
+; HSA-NEXT: [[PTR1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; HSA-NEXT: store volatile ptr addrspace(1) [[PTR0_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8:![0-9]+]]
+; HSA-NEXT: store volatile ptr addrspace(1) [[PTR1_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8]]
+; HSA-NEXT: ret void
+;
+; MESA-LABEL: @kern_noalias_global_ptr_x2(
+; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT: [[PTR0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[PTR0_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR0_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; MESA-NEXT: [[PTR1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 44
+; MESA-NEXT: [[PTR1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; MESA-NEXT: store volatile ptr addrspace(1) [[PTR0_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8:![0-9]+]]
+; MESA-NEXT: store volatile ptr addrspace(1) [[PTR1_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8]]
+; MESA-NEXT: ret void
;
store volatile ptr addrspace(1) %ptr0, ptr addrspace(1) poison
store volatile ptr addrspace(1) %ptr1, ptr addrspace(1) poison
@@ -1855,10 +1878,24 @@ attributes #2 = { nounwind "target-cpu"="tahiti" }
; HSA: [[META2]] = !{i64 42}
; HSA: [[META3]] = !{i64 128}
; HSA: [[META4]] = !{i64 1024}
+; HSA: [[META5]] = !{[[META6:![0-9]+]]}
+; HSA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: %ptr"}
+; HSA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"}
+; HSA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]}
+; HSA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: %ptr0"}
+; HSA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"}
+; HSA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: %ptr1"}
;.
; MESA: [[META0]] = !{}
; MESA: [[RNG1]] = !{i32 0, i32 8}
; MESA: [[META2]] = !{i64 42}
; MESA: [[META3]] = !{i64 128}
; MESA: [[META4]] = !{i64 1024}
+; MESA: [[META5]] = !{[[META6:![0-9]+]]}
+; MESA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: %ptr"}
+; MESA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"}
+; MESA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]}
+; MESA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: %ptr0"}
+; MESA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"}
+; MESA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: %ptr1"}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
index 0ac3d652050d3..ba59b94b6d141 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
@@ -7,26 +7,44 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8
+; SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1
-; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2
-; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; SDAG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:12
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: buffers_dont_alias:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0
+; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4
+; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:8
+; GISEL-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1
-; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2
-; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; GISEL-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:12
; GISEL-NEXT: s_endpgm
%l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0)
%s0 = fmul float %l0, %l0
@@ -56,15 +74,26 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: s_and_b32 s5, s1, 0xffff
; SDAG-NEXT: s_mov_b32 s4, s0
-; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; SDAG-NEXT: s_and_b32 s5, s3, 0xffff
-; SDAG-NEXT: s_mov_b32 s4, s2
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SDAG-NEXT: s_and_b32 s1, s3, 0xffff
+; SDAG-NEXT: s_mov_b32 s0, s2
+; SDAG-NEXT: s_mov_b32 s2, s6
+; SDAG-NEXT: s_mov_b32 s3, s7
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1
-; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2
-; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: buffers_from_flat_dont_alias:
@@ -72,18 +101,29 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-NEXT: s_mov_b32 s7, 0
; GISEL-NEXT: s_mov_b32 s6, 16
+; GISEL-NEXT: s_mov_b32 s10, s6
+; GISEL-NEXT: s_mov_b32 s11, s7
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_and_b32 s5, s1, 0xffff
; GISEL-NEXT: s_mov_b32 s4, s0
-; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT: s_and_b32 s5, s3, 0xffff
-; GISEL-NEXT: s_mov_b32 s4, s2
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT: s_and_b32 s9, s3, 0xffff
+; GISEL-NEXT: s_mov_b32 s8, s2
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1
-; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2
-; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12
; GISEL-NEXT: s_endpgm
%a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %a.flat, i16 0, i32 16, i32 0)
%b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %b.flat, i16 0, i32 16, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index ec065b4daa376..73438a7462531 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -647,13 +647,15 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind {
; GFX6-LABEL: s_sub_i64:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_sub_u32 s0, s0, s2
-; GFX6-NEXT: s_subb_u32 s1, s1, s3
+; GFX6-NEXT: s_mov_b32 s4, s0
+; GFX6-NEXT: s_sub_u32 s0, s2, s8
+; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: s_subb_u32 s1, s3, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -661,41 +663,41 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
;
; GFX8-LABEL: s_sub_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_u32 s0, s0, s2
-; GFX8-NEXT: s_subb_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_sub_u32 s0, s2, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_subb_u32 s1, s3, s5
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_sub_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s0, s0, s2
-; GFX9-NEXT: s_subb_u32 s1, s1, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT: s_sub_u32 s2, s2, s6
+; GFX9-NEXT: s_subb_u32 s3, s3, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: s_sub_i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%result = sub i64 %a, %b
store i64 %result, ptr addrspace(1) %out, align 8
@@ -740,12 +742,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_sub_i64:
@@ -832,14 +834,14 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_sub_v2i64:
>From 7c2ef51a5284801d6342dbcea0bdfd027caab2e1 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Thu, 19 Jun 2025 04:14:51 +0100
Subject: [PATCH 2/2] Change comment style.
---
llvm/lib/Transforms/Utils/InlineFunction.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index a56dc39e569c0..ffa23823b030a 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1394,11 +1394,11 @@ void llvm::addAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
}
void llvm::addAliasScopeMetadata(Function &F) {
- addAliasScopeMetadataImpl(/* CB */ nullptr, &F, /* VMap */ nullptr,
+ addAliasScopeMetadataImpl(/*CB=*/ nullptr, &F, /*VMap=*/ nullptr,
F.getParent()->getDataLayout(),
- /* CalleeAAR */ nullptr,
- /* InlinedFunctionInfo */ nullptr,
- /* UseNoAliasIntrinsic */ false);
+ /*CalleeAAR=*/ nullptr,
+ /*InlinedFunctionInfo=*/ nullptr,
+ /*UseNoAliasIntrinsic=*/ false);
}
static bool MayContainThrowingOrExitingCallAfterCB(CallBase *Begin,
More information about the llvm-commits
mailing list