[llvm] [AMDGPU] Propagate alias information in AMDGPULowerKernelArguments. (PR #144714)

Leon Clark via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 18 20:19:57 PDT 2025


https://github.com/PeddleSpam updated https://github.com/llvm/llvm-project/pull/144714

>From c4745be7beaf4873538991a0d8aa9063e79cfc5c Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 18 Jun 2025 15:39:48 +0100
Subject: [PATCH 1/3] [AMDGPU] Propagate alias information in
 AMDGPULowerKernelArguments.

This patch reimplements https://reviews.llvm.org/D108363 and https://reviews.llvm.org/D108361 to emit !noalias and !alias.scope metadata for noalias kernel arguments.
---
 llvm/include/llvm/Transforms/Utils/Cloning.h  |   11 +
 .../AMDGPU/AMDGPULowerKernelArguments.cpp     |   11 +-
 llvm/lib/Transforms/Utils/InlineFunction.cpp  |  100 +-
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   |  200 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   |  280 +--
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     |   78 +-
 .../llvm.amdgcn.sched.group.barrier.gfx11.ll  |  198 +-
 ...vm.amdgcn.sched.group.barrier.iterative.ll | 2109 ++++++++++++-----
 .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll |  526 ++--
 llvm/test/CodeGen/AMDGPU/lower-kernargs.ll    |   55 +-
 .../AMDGPU/ptr-buffer-alias-scheduling.ll     |   88 +-
 llvm/test/CodeGen/AMDGPU/sub.ll               |   58 +-
 12 files changed, 2438 insertions(+), 1276 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h
index 6b56230a6e1d4..05490e6c81bc8 100644
--- a/llvm/include/llvm/Transforms/Utils/Cloning.h
+++ b/llvm/include/llvm/Transforms/Utils/Cloning.h
@@ -363,6 +363,17 @@ LLVM_ABI void updateProfileCallee(
     Function *Callee, int64_t EntryDelta,
     const ValueMap<const Value *, WeakTrackingVH> *VMap = nullptr);
 
+/// Adds `!noalias` and `!alias.scope` metadata for memory accesses based on
+/// the `noalias` arguments of the function called by `CB`.
+void addAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
+                           const DataLayout &DL, AAResults *CalleeAAR,
+                           ClonedCodeInfo &InlinedFunctionInfo,
+                           bool UseNoAliasIntrinsic);
+
+/// Adds `!noalias` and `!alias.scope` metadata for memory accesses based on
+/// the `noalias` arguments of `F`.
+void addAliasScopeMetadata(Function &F);
+
 /// Find the 'llvm.experimental.noalias.scope.decl' intrinsics in the specified
 /// basic blocks and extract their scope. These are candidates for duplication
 /// when cloning.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index dec781d71c54e..edd19e1ef1241 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -20,6 +20,7 @@
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 
 #define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
 
@@ -86,6 +87,9 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
       Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
 
   uint64_t ExplicitArgOffset = 0;
+
+  addAliasScopeMetadata(F);
+
   for (Argument &Arg : F.args()) {
     const bool IsByRef = Arg.hasByRefAttr();
     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
@@ -124,11 +128,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
            PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
           !ST.hasUsableDSOffset())
         continue;
-
-      // FIXME: We can replace this with equivalent alias.scope/noalias
-      // metadata, but this appears to be a lot of work.
-      if (Arg.hasNoAliasAttr())
-        continue;
     }
 
     auto *VT = dyn_cast<FixedVectorType>(ArgTy);
@@ -215,8 +214,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
       }
     }
 
-    // TODO: Convert noalias arg to !noalias
-
     if (DoShiftOpt) {
       Value *ExtractBits = OffsetDiff == 0 ?
         Load : Builder.CreateLShr(Load, OffsetDiff * 8);
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 7df5e9958182c..a56dc39e569c0 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -51,6 +51,7 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -1114,17 +1115,30 @@ void ScopedAliasMetadataDeepCloner::remap(Function::iterator FStart,
 /// then add new alias scopes for each noalias argument, tag the mapped noalias
 /// parameters with noalias metadata specifying the new scope, and tag all
 /// non-derived loads, stores and memory intrinsics with the new alias scopes.
-static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
-                                  const DataLayout &DL, AAResults *CalleeAAR,
-                                  ClonedCodeInfo &InlinedFunctionInfo) {
-  if (!EnableNoAliasConversion)
-    return;
-
-  const Function *CalledFunc = CB.getCalledFunction();
+static void addAliasScopeMetadataImpl(CallBase *CB, Function *F,
+                                      ValueToValueMapTy *VMap,
+                                      const DataLayout &DL,
+                                      AAResults *CalleeAAR,
+                                      ClonedCodeInfo *InlinedFunctionInfo,
+                                      bool UseNoAliasIntrinsic) {
+  assert(CB || F);
+  const Function *CalledFunc = CB ? CB->getCalledFunction() : F;
   SmallVector<const Argument *, 4> NoAliasArgs;
 
+  std::function<bool(const Argument *, Attribute::AttrKind)> paramHasAttr;
+  if (CB) {
+    paramHasAttr = [&](const Argument *Arg, Attribute::AttrKind Attr) -> bool {
+      return CB->paramHasAttr(Arg->getArgNo(), Attr);
+    };
+
+  } else {
+    paramHasAttr = [&](const Argument *Arg, Attribute::AttrKind Attr) -> bool {
+      return Arg->hasAttribute(Attr);
+    };
+  }
+
   for (const Argument &Arg : CalledFunc->args())
-    if (CB.paramHasAttr(Arg.getArgNo(), Attribute::NoAlias) && !Arg.use_empty())
+    if (paramHasAttr(&Arg, Attribute::NoAlias) && !Arg.use_empty())
       NoAliasArgs.push_back(&Arg);
 
   if (NoAliasArgs.empty())
@@ -1166,29 +1180,20 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
     NewScopes.insert(std::make_pair(A, NewScope));
 
     if (UseNoAliasIntrinsic) {
+      assert(CB);
       // Introduce a llvm.experimental.noalias.scope.decl for the noalias
       // argument.
       MDNode *AScopeList = MDNode::get(CalledFunc->getContext(), NewScope);
       auto *NoAliasDecl =
-          IRBuilder<>(&CB).CreateNoAliasScopeDeclaration(AScopeList);
+          IRBuilder<>(CB).CreateNoAliasScopeDeclaration(AScopeList);
       // Ignore the result for now. The result will be used when the
       // llvm.noalias intrinsic is introduced.
       (void)NoAliasDecl;
     }
   }
 
-  // Iterate over all new instructions in the map; for all memory-access
-  // instructions, add the alias scope metadata.
-  for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
-       VMI != VMIE; ++VMI) {
-    if (const Instruction *I = dyn_cast<Instruction>(VMI->first)) {
-      if (!VMI->second)
-        continue;
-
-      Instruction *NI = dyn_cast<Instruction>(VMI->second);
-      if (!NI || InlinedFunctionInfo.isSimplified(I, NI))
-        continue;
-
+  {
+    auto addAliasMD = [&](const Instruction *I, Instruction *NI) -> void {
       bool IsArgMemOnlyCall = false, IsFuncCall = false;
       SmallVector<const Value *, 2> PtrArgs;
 
@@ -1207,7 +1212,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
         // know that about the inlined clone of this call site, and we don't
         // need to add metadata.
         if (Call->doesNotAccessMemory())
-          continue;
+          return;
 
         IsFuncCall = true;
         if (CalleeAAR) {
@@ -1215,7 +1220,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
 
           // We'll retain this knowledge without additional metadata.
           if (ME.onlyAccessesInaccessibleMem())
-            continue;
+            return;
 
           if (ME.onlyAccessesArgPointees())
             IsArgMemOnlyCall = true;
@@ -1237,7 +1242,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
       // However, if this is a call, this we might just alias with none of the
       // noalias arguments.
       if (PtrArgs.empty() && !IsFuncCall)
-        continue;
+        return;
 
       // It is possible that there is only one underlying object, but you
       // need to go through several PHIs to see it, and thus could be
@@ -1270,7 +1275,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
         // completely describe the aliasing properties using alias.scope
         // metadata (and, thus, won't add any).
         if (const Argument *A = dyn_cast<Argument>(V)) {
-          if (!CB.paramHasAttr(A->getArgNo(), Attribute::NoAlias))
+          if (!paramHasAttr(A, Attribute::NoAlias))
             UsesAliasingPtr = true;
         } else {
           UsesAliasingPtr = true;
@@ -1292,7 +1297,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
       // Nothing we can do if the used underlying object cannot be reliably
       // determined.
       if (UsesUnknownObject)
-        continue;
+        return;
 
       // A function call can always get captured noalias pointers (via other
       // parameters, globals, etc.).
@@ -1353,10 +1358,49 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
             LLVMContext::MD_alias_scope,
             MDNode::concatenate(NI->getMetadata(LLVMContext::MD_alias_scope),
                                 MDNode::get(CalledFunc->getContext(), Scopes)));
+    };
+
+    if (VMap) {
+      assert(InlinedFunctionInfo);
+
+      for (ValueToValueMapTy::iterator VMI = VMap->begin(), VMIE = VMap->end();
+           VMI != VMIE; ++VMI) {
+        const Instruction *I = dyn_cast<Instruction>(VMI->first);
+        if (!I || !VMI->second)
+          continue;
+
+        Instruction *NI = dyn_cast<Instruction>(VMI->second);
+        if (!NI || InlinedFunctionInfo->isSimplified(I, NI))
+          continue;
+
+        addAliasMD(I, NI);
+      }
+
+    } else {
+      for (auto It = inst_begin(F), End = inst_end(F); It != End; ++It) {
+        Instruction *I = &(*It);
+        addAliasMD(I, I);
+      }
     }
   }
 }
 
+void llvm::addAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
+                                 const DataLayout &DL, AAResults *CalleeAAR,
+                                 ClonedCodeInfo &InlinedFunctionInfo,
+                                 bool UseNoAliasIntrinsic) {
+  addAliasScopeMetadataImpl(&CB, /* F */ nullptr, &VMap, DL, CalleeAAR,
+                            &InlinedFunctionInfo, UseNoAliasIntrinsic);
+}
+
+void llvm::addAliasScopeMetadata(Function &F) {
+  addAliasScopeMetadataImpl(/* CB */ nullptr, &F, /* VMap */ nullptr,
+                            F.getParent()->getDataLayout(),
+                            /* CalleeAAR */ nullptr,
+                            /* InlinedFunctionInfo */ nullptr,
+                            /* UseNoAliasIntrinsic */ false);
+}
+
 static bool MayContainThrowingOrExitingCallAfterCB(CallBase *Begin,
                                                    ReturnInst *End) {
 
@@ -2797,7 +2841,9 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
     SAMetadataCloner.remap(FirstNewBlock, Caller->end());
 
     // Add noalias metadata if necessary.
-    AddAliasScopeMetadata(CB, VMap, DL, CalleeAAR, InlinedFunctionInfo);
+    if (EnableNoAliasConversion)
+      addAliasScopeMetadata(CB, VMap, DL, CalleeAAR, InlinedFunctionInfo,
+                            UseNoAliasIntrinsic);
 
     // Clone return attributes on the callsite into the calls within the inlined
     // function which feed into its return value.
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 99b7c7737f4ae..a87baca5a5878 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -105,11 +105,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v2, v0
+; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbh_u32_e32 v2, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -181,8 +181,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
@@ -261,8 +261,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v3, v3
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v2
@@ -534,13 +534,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_ctlz_zero_undef_i64_with_select:
@@ -605,15 +605,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
-; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT:    v_ffbh_u32_e32 v1, v1
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT:    flat_load_ubyte v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 24, v2
+; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VI-NEXT:    v_ffbh_u32_e32 v3, v3
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v3, vcc
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -706,21 +706,21 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_readfirstlane_b32 s2, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_readfirstlane_b32 s3, v0
-; VI-NEXT:    s_lshl_b32 s2, s2, 8
-; VI-NEXT:    s_or_b32 s2, s2, s3
-; VI-NEXT:    s_lshl_b32 s3, s2, 16
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_flbit_i32_b32 s3, s3
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
-; VI-NEXT:    s_cselect_b32 s2, s3, 32
+; VI-NEXT:    flat_load_ubyte v3, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_readfirstlane_b32 s0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_readfirstlane_b32 s1, v3
+; VI-NEXT:    s_lshl_b32 s0, s0, 8
+; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s1, s0, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_flbit_i32_b32 s1, s1
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
+; VI-NEXT:    s_cselect_b32 s0, s1, 32
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -814,37 +814,37 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_add_u32 s4, s2, 1
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    s_add_u32 s4, s2, 3
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_add_u32 s2, s2, 1
-; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    s_add_u32 s2, s2, 2
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v7, s3
+; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v7, s3
 ; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v3, v[4:5]
-; VI-NEXT:    flat_load_ubyte v4, v[6:7]
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
+; VI-NEXT:    flat_load_ubyte v3, v[0:1]
+; VI-NEXT:    flat_load_ubyte v4, v[4:5]
+; VI-NEXT:    flat_load_ubyte v5, v[6:7]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_ffbh_u32_e32 v0, v0
-; VI-NEXT:    v_min_u32_e32 v2, 32, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_ffbh_u32_e32 v2, v2
+; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -965,29 +965,30 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v7, s5
 ; VI-NEXT:    v_mov_b32_e32 v6, s4
-; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_add_u32 s4, s2, 1
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v9, s5
 ; VI-NEXT:    v_mov_b32_e32 v8, s4
-; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    s_add_u32 s4, s2, 3
 ; VI-NEXT:    flat_load_ubyte v10, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v11, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v12, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v6, v[6:7]
 ; VI-NEXT:    flat_load_ubyte v7, v[8:9]
+; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_add_u32 s4, s2, 1
-; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s2, 2
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
+; VI-NEXT:    flat_load_ubyte v8, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; VI-NEXT:    s_waitcnt vmcnt(6)
@@ -1001,19 +1002,18 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
 ; VI-NEXT:    v_ffbh_u32_e32 v4, v4
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v5, v5, v8
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    v_or_b32_e32 v0, v0, v2
-; VI-NEXT:    v_ffbh_u32_e32 v0, v0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT:    v_min_u32_e32 v0, v0, v4
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_min_u32_e32 v0, 64, v0
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v2, v5
+; VI-NEXT:    v_ffbh_u32_e32 v2, v2
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
+; VI-NEXT:    v_min_u32_e32 v2, v2, v4
+; VI-NEXT:    v_min_u32_e32 v2, 64, v2
+; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_ctlz_zero_undef_i64_with_select:
@@ -1119,12 +1119,12 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; VI-NEXT:    v_ffbh_u32_e32 v2, v0
+; VI-NEXT:    flat_load_ubyte v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; VI-NEXT:    v_ffbh_u32_e32 v2, v2
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1259,10 +1259,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_flbit_i32_b64 s0, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1505,11 +1505,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v2, v0
+; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbh_u32_e32 v2, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1584,11 +1584,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v2, v0
+; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbh_u32_e32 v2, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1661,11 +1661,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v2, v0
+; VI-NEXT:    flat_load_ubyte v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbh_u32_e32 v2, v2
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1858,13 +1858,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbh_u32_e32 v3, v2
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1942,13 +1942,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbh_u32_e32 v3, v2
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -2026,13 +2026,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbh_u32_e32 v3, v2
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -2111,13 +2111,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbh_u32_e32 v3, v2
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 73fddb53d1dcc..23d5cb73e8dd4 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -92,11 +92,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbl_b32_e32 v2, v0
+; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbl_b32_e32 v2, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -168,8 +168,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v1
 ; VI-NEXT:    v_ffbl_b32_e32 v0, v0
@@ -248,8 +248,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v3, v3
 ; VI-NEXT:    v_ffbl_b32_e32 v2, v2
@@ -511,13 +511,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; VI-LABEL: s_cttz_zero_undef_i64_with_select:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ff1_i32_b64 s2, s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_ff1_i32_b64 s0, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_cttz_zero_undef_i64_with_select:
@@ -581,14 +581,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbl_b32_e32 v1, v0
-; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT:    flat_load_ubyte v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbl_b32_e32 v3, v2
+; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v3, vcc
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -677,17 +677,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_ffbl_b32_e32 v1, v0
-; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT:    flat_load_ubyte v3, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-NEXT:    v_ffbl_b32_e32 v3, v2
+; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v3, vcc
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -778,37 +778,37 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_add_u32 s4, s2, 1
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    s_add_u32 s4, s2, 3
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_add_u32 s2, s2, 1
-; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    s_add_u32 s2, s2, 2
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v7, s3
+; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v7, s3
 ; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v3, v[4:5]
-; VI-NEXT:    flat_load_ubyte v4, v[6:7]
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
+; VI-NEXT:    flat_load_ubyte v3, v[0:1]
+; VI-NEXT:    flat_load_ubyte v4, v[4:5]
+; VI-NEXT:    flat_load_ubyte v5, v[6:7]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_ffbl_b32_e32 v0, v0
-; VI-NEXT:    v_min_u32_e32 v2, 32, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_ffbl_b32_e32 v2, v2
+; VI-NEXT:    v_min_u32_e32 v2, 32, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -929,55 +929,55 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v7, s5
 ; VI-NEXT:    v_mov_b32_e32 v6, s4
-; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_add_u32 s4, s2, 1
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v9, s5
 ; VI-NEXT:    v_mov_b32_e32 v8, s4
-; VI-NEXT:    s_add_u32 s4, s2, 2
-; VI-NEXT:    flat_load_ubyte v10, v[0:1]
-; VI-NEXT:    flat_load_ubyte v11, v[2:3]
-; VI-NEXT:    flat_load_ubyte v12, v[4:5]
-; VI-NEXT:    flat_load_ubyte v6, v[6:7]
-; VI-NEXT:    flat_load_ubyte v7, v[8:9]
+; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    v_mov_b32_e32 v11, s3
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v10, s2
+; VI-NEXT:    flat_load_ubyte v12, v[0:1]
+; VI-NEXT:    flat_load_ubyte v13, v[2:3]
+; VI-NEXT:    flat_load_ubyte v4, v[4:5]
+; VI-NEXT:    flat_load_ubyte v5, v[6:7]
+; VI-NEXT:    flat_load_ubyte v6, v[8:9]
+; VI-NEXT:    flat_load_ubyte v7, v[10:11]
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_add_u32 s2, s2, 2
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_add_u32 s4, s2, 1
-; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_load_ubyte v8, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v3, v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v12
 ; VI-NEXT:    s_waitcnt vmcnt(6)
-; VI-NEXT:    v_or_b32_e32 v4, v4, v11
+; VI-NEXT:    v_or_b32_e32 v3, v3, v13
 ; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v12
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; VI-NEXT:    s_waitcnt vmcnt(4)
-; VI-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
+; VI-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
-; VI-NEXT:    v_ffbl_b32_e32 v4, v4
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v4, v4, v7
+; VI-NEXT:    v_ffbl_b32_e32 v3, v3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 32, v3
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v8
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    v_or_b32_e32 v0, v0, v2
-; VI-NEXT:    v_ffbl_b32_e32 v0, v0
-; VI-NEXT:    v_min_u32_e32 v0, v4, v0
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_min_u32_e32 v0, 64, v0
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v2, v4
+; VI-NEXT:    v_ffbl_b32_e32 v2, v2
+; VI-NEXT:    v_min_u32_e32 v2, v3, v2
+; VI-NEXT:    v_min_u32_e32 v2, 64, v2
+; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: v_cttz_zero_undef_i64_with_select:
@@ -1091,36 +1091,36 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_add_u32 s4, s2, 1
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    s_add_u32 s4, s2, 3
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_add_u32 s2, s2, 1
-; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    s_add_u32 s2, s2, 2
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v7, s3
+; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v7, s3
 ; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v3, v[4:5]
-; VI-NEXT:    flat_load_ubyte v4, v[6:7]
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
+; VI-NEXT:    flat_load_ubyte v3, v[0:1]
+; VI-NEXT:    flat_load_ubyte v4, v[4:5]
+; VI-NEXT:    flat_load_ubyte v5, v[6:7]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_ffbl_b32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_ffbl_b32_e32 v2, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1213,36 +1213,36 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_add_u32 s4, s2, 1
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    s_add_u32 s4, s2, 3
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_add_u32 s2, s2, 1
-; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    s_add_u32 s2, s2, 2
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v7, s3
+; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v7, s3
 ; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v3, v[4:5]
-; VI-NEXT:    flat_load_ubyte v4, v[6:7]
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
+; VI-NEXT:    flat_load_ubyte v3, v[0:1]
+; VI-NEXT:    flat_load_ubyte v4, v[4:5]
+; VI-NEXT:    flat_load_ubyte v5, v[6:7]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_ffbl_b32_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_ffbl_b32_e32 v2, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1338,39 +1338,39 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s4, s2, 3
+; VI-NEXT:    s_add_u32 s4, s2, 1
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    s_add_u32 s4, s2, 2
+; VI-NEXT:    s_add_u32 s4, s2, 3
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_add_u32 s2, s2, 1
-; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    s_add_u32 s2, s2, 2
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v7, s3
+; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v7, s3
 ; VI-NEXT:    v_mov_b32_e32 v6, s2
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v3, v[4:5]
-; VI-NEXT:    flat_load_ubyte v4, v[6:7]
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
+; VI-NEXT:    flat_load_ubyte v3, v[0:1]
+; VI-NEXT:    flat_load_ubyte v4, v[4:5]
+; VI-NEXT:    flat_load_ubyte v5, v[6:7]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_ffbl_b32_e32 v0, v0
-; VI-NEXT:    v_min_u32_e32 v0, 32, v0
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_ffbl_b32_e32 v2, v2
+; VI-NEXT:    v_min_u32_e32 v2, 32, v2
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1455,11 +1455,11 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbl_b32_e32 v2, v0
+; VI-NEXT:    flat_load_ubyte v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbl_b32_e32 v2, v2
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1541,19 +1541,19 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s4, s2, 1
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_load_ubyte v4, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_ffbl_b32_e32 v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_ffbl_b32_e32 v2, v2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 745e047348626..167fa469945a6 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1466,10 +1466,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2
 ; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s8, s2
-; SI-NEXT:    s_mov_b32 s9, s3
-; SI-NEXT:    s_mov_b32 s2, s10
-; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_mov_b32 s8, s0
+; SI-NEXT:    s_mov_b32 s9, s1
+; SI-NEXT:    s_mov_b32 s6, s10
+; SI-NEXT:    s_mov_b32 s7, s11
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
@@ -1485,15 +1487,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT:    v_alignbit_b32 v4, v4, v5, 24
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v6
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; SI-NEXT:    buffer_store_dword v4, off, s[8:11], 0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT:    buffer_store_dword v4, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s8, 0x4000405
+; VI-NEXT:    s_mov_b32 s12, 0x4000405
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v0
@@ -1515,10 +1517,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; VI-NEXT:    flat_load_ubyte v4, v[0:1]
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s4, s2
-; VI-NEXT:    s_mov_b32 s5, s3
-; VI-NEXT:    s_mov_b32 s2, s6
-; VI-NEXT:    s_mov_b32 s3, s7
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v6
@@ -1531,9 +1535,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 ; VI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; VI-NEXT:    v_or_b32_e32 v5, v7, v3
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
-; VI-NEXT:    v_perm_b32 v4, v4, v5, s8
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; VI-NEXT:    buffer_store_dword v4, off, s[4:7], 0
+; VI-NEXT:    v_perm_b32 v4, v4, v5, s12
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v4, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
@@ -1628,21 +1632,23 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1
 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s3, s7
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s10, s6
+; SI-NEXT:    s_mov_b32 s11, s7
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s0, s6
-; SI-NEXT:    s_mov_b32 s1, s7
-; SI-NEXT:    s_mov_b32 s6, s2
-; SI-NEXT:    s_mov_b32 s7, s3
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_mov_b32 s8, s2
+; SI-NEXT:    s_mov_b32 s9, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; SI-NEXT:    v_and_b32_e32 v6, 0xff00, v4
@@ -1664,29 +1670,31 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x9000000, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    v_mov_b32_e32 v5, 0xffffff00
+; VI-NEXT:    v_mov_b32_e32 v6, 9
+; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    v_mov_b32_e32 v6, 9
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    v_mov_b32_e32 v7, 0x900
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s2
-; VI-NEXT:    s_mov_b32 s5, s3
-; VI-NEXT:    s_mov_b32 s2, s6
-; VI-NEXT:    s_mov_b32 s3, s7
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
@@ -1696,14 +1704,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_add_u16_e32 v9, 9, v4
 ; VI-NEXT:    v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u16_e32 v0, 0x900, v0
 ; VI-NEXT:    v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
index 6507976872410..50e4fd5de14c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll
@@ -180,12 +180,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
 ; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 5, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v48, 5, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_nc_u32_e32 v17, s0, v16
-; GCN-NEXT:    v_add_nc_u32_e32 v16, s1, v16
-; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:16
-; GCN-NEXT:    ds_load_b128 v[0:3], v17
+; GCN-NEXT:    v_add_nc_u32_e32 v32, s0, v48
+; GCN-NEXT:    v_dual_mov_b32 v57, s1 :: v_dual_add_nc_u32 v56, s1, v48
+; GCN-NEXT:    ds_load_b128 v[4:7], v32 offset:16
+; GCN-NEXT:    ds_load_b128 v[0:3], v32
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
@@ -194,66 +194,61 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
 ; GCN-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
+; GCN-NEXT:    ds_load_b128 v[4:7], v32 offset:2064
+; GCN-NEXT:    ds_load_b128 v[0:3], v32 offset:2048
+; GCN-NEXT:    ds_load_b128 v[20:23], v32 offset:6160
+; GCN-NEXT:    ds_load_b128 v[16:19], v32 offset:6144
+; GCN-NEXT:    ds_load_b128 v[28:31], v32 offset:12304
+; GCN-NEXT:    ds_load_b128 v[24:27], v32 offset:12288
+; GCN-NEXT:    ds_load_b128 v[36:39], v32 offset:20496
+; GCN-NEXT:    ds_load_b128 v[32:35], v32 offset:20480
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:16
-; GCN-NEXT:    ds_store_b128 v16, v[8:11]
-; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:2064
-; GCN-NEXT:    ds_load_b128 v[0:3], v17 offset:2048
-; GCN-NEXT:    v_mov_b32_e32 v16, s1
+; GCN-NEXT:    s_waitcnt lgkmcnt(2)
+; GCN-NEXT:    v_dual_mov_b32 v55, v31 :: v_dual_mov_b32 v54, v30
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_dual_mov_b32 v47, v39 :: v_dual_mov_b32 v46, v38
+; GCN-NEXT:    v_dual_mov_b32 v45, v37 :: v_dual_mov_b32 v44, v36
+; GCN-NEXT:    v_dual_mov_b32 v43, v35 :: v_dual_mov_b32 v42, v34
+; GCN-NEXT:    v_dual_mov_b32 v41, v33 :: v_dual_mov_b32 v40, v32
+; GCN-NEXT:    v_dual_mov_b32 v53, v29 :: v_dual_mov_b32 v52, v28
+; GCN-NEXT:    v_dual_mov_b32 v51, v27 :: v_dual_mov_b32 v50, v26
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[40:47], v[32:39], v[32:39], v[40:47]
+; GCN-NEXT:    v_dual_mov_b32 v39, v7 :: v_dual_mov_b32 v38, v6
+; GCN-NEXT:    v_dual_mov_b32 v37, v5 :: v_dual_mov_b32 v36, v4
+; GCN-NEXT:    v_dual_mov_b32 v35, v3 :: v_dual_mov_b32 v34, v2
+; GCN-NEXT:    v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0
+; GCN-NEXT:    v_dual_mov_b32 v49, v25 :: v_dual_mov_b32 v48, v24
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[0:7], v[0:7], v[32:39]
+; GCN-NEXT:    v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v17
+; GCN-NEXT:    v_dual_mov_b32 v2, v18 :: v_dual_mov_b32 v3, v19
+; GCN-NEXT:    v_dual_mov_b32 v4, v20 :: v_dual_mov_b32 v5, v21
+; GCN-NEXT:    v_dual_mov_b32 v6, v22 :: v_dual_mov_b32 v7, v23
+; GCN-NEXT:    ds_store_b128 v56, v[12:15] offset:16
+; GCN-NEXT:    ds_store_b128 v56, v[8:11]
+; GCN-NEXT:    ds_store_b128 v57, v[36:39] offset:2064
+; GCN-NEXT:    ds_store_b128 v57, v[32:35] offset:2048
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[0:7], v[16:23], v[16:23], v[0:7]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; GCN-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; GCN-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; GCN-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:2064
-; GCN-NEXT:    ds_store_b128 v16, v[8:11] offset:2048
-; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:6160
-; GCN-NEXT:    ds_load_b128 v[0:3], v17 offset:6144
+; GCN-NEXT:    ds_store_b128 v57, v[4:7] offset:4112
+; GCN-NEXT:    ds_store_b128 v57, v[0:3] offset:4096
+; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[48:55], v[24:31], v[24:31], v[48:55]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; GCN-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; GCN-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; GCN-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:4112
-; GCN-NEXT:    ds_store_b128 v16, v[8:11] offset:4096
-; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:12304
-; GCN-NEXT:    ds_load_b128 v[0:3], v17 offset:12288
+; GCN-NEXT:    ds_store_b128 v57, v[52:55] offset:6160
+; GCN-NEXT:    ds_store_b128 v57, v[48:51] offset:6144
+; GCN-NEXT:    ds_store_b128 v57, v[44:47] offset:8208
+; GCN-NEXT:    ds_store_b128 v57, v[40:43] offset:8192
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; GCN-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; GCN-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; GCN-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:6160
-; GCN-NEXT:    ds_store_b128 v16, v[8:11] offset:6144
-; GCN-NEXT:    ds_load_b128 v[4:7], v17 offset:20496
-; GCN-NEXT:    ds_load_b128 v[0:3], v17 offset:20480
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; GCN-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; GCN-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; GCN-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    ds_store_b128 v16, v[12:15] offset:8208
-; GCN-NEXT:    ds_store_b128 v16, v[8:11] offset:8192
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
@@ -262,12 +257,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; EXACTCUTOFF-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v16, 5, v0
+; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v48, 5, v0
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v17, s0, v16
-; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v16, s1, v16
-; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:16
-; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17
+; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v32, s0, v48
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v57, s1 :: v_dual_add_nc_u32 v56, s1, v48
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v32 offset:16
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v32
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
@@ -276,66 +271,61 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
 ; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v32 offset:2064
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v32 offset:2048
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[20:23], v32 offset:6160
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[16:19], v32 offset:6144
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[28:31], v32 offset:12304
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[24:27], v32 offset:12288
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[36:39], v32 offset:20496
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[32:35], v32 offset:20480
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:16
-; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11]
-; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:2064
-; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17 offset:2048
-; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v16, s1
+; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(2)
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v55, v31 :: v_dual_mov_b32 v54, v30
+; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v47, v39 :: v_dual_mov_b32 v46, v38
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v45, v37 :: v_dual_mov_b32 v44, v36
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v43, v35 :: v_dual_mov_b32 v42, v34
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v41, v33 :: v_dual_mov_b32 v40, v32
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v53, v29 :: v_dual_mov_b32 v52, v28
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v51, v27 :: v_dual_mov_b32 v50, v26
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[40:47], v[32:39], v[32:39], v[40:47]
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v39, v7 :: v_dual_mov_b32 v38, v6
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v37, v5 :: v_dual_mov_b32 v36, v4
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v35, v3 :: v_dual_mov_b32 v34, v2
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v49, v25 :: v_dual_mov_b32 v48, v24
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[0:7], v[0:7], v[32:39]
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v17
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v2, v18 :: v_dual_mov_b32 v3, v19
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v4, v20 :: v_dual_mov_b32 v5, v21
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v6, v22 :: v_dual_mov_b32 v7, v23
+; EXACTCUTOFF-NEXT:    ds_store_b128 v56, v[12:15] offset:16
+; EXACTCUTOFF-NEXT:    ds_store_b128 v56, v[8:11]
+; EXACTCUTOFF-NEXT:    ds_store_b128 v57, v[36:39] offset:2064
+; EXACTCUTOFF-NEXT:    ds_store_b128 v57, v[32:35] offset:2048
+; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[0:7], v[16:23], v[16:23], v[0:7]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:2064
-; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11] offset:2048
-; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:6160
-; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17 offset:6144
+; EXACTCUTOFF-NEXT:    ds_store_b128 v57, v[4:7] offset:4112
+; EXACTCUTOFF-NEXT:    ds_store_b128 v57, v[0:3] offset:4096
+; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[48:55], v[24:31], v[24:31], v[48:55]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:4112
-; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11] offset:4096
-; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:12304
-; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17 offset:12288
+; EXACTCUTOFF-NEXT:    ds_store_b128 v57, v[52:55] offset:6160
+; EXACTCUTOFF-NEXT:    ds_store_b128 v57, v[48:51] offset:6144
+; EXACTCUTOFF-NEXT:    ds_store_b128 v57, v[44:47] offset:8208
+; EXACTCUTOFF-NEXT:    ds_store_b128 v57, v[40:43] offset:8192
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:6160
-; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11] offset:6144
-; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v17 offset:20496
-; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v17 offset:20480
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v15, v7 :: v_dual_mov_b32 v14, v6
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v13, v5 :: v_dual_mov_b32 v12, v4
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
-; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; EXACTCUTOFF-NEXT:    v_wmma_f16_16x16x16_f16 v[8:15], v[0:7], v[0:7], v[8:15]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[12:15] offset:8208
-; EXACTCUTOFF-NEXT:    ds_store_b128 v16, v[8:11] offset:8192
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
index 371b4f070094d..f4c21b01bf7f6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
@@ -9,265 +9,957 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-MINREG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GCN-MINREG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-MINREG-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; GCN-MINREG-NEXT:    v_mov_b32_e32 v2, 1.0
-; GCN-MINREG-NEXT:    v_mov_b32_e32 v1, 2.0
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v40, 1.0
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v39, 2.0
 ; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT:    v_add_u32_e32 v4, s0, v0
-; GCN-MINREG-NEXT:    ds_read_b128 a[28:31], v4 offset:112
-; GCN-MINREG-NEXT:    ds_read_b128 a[24:27], v4 offset:96
-; GCN-MINREG-NEXT:    ds_read_b128 a[20:23], v4 offset:80
-; GCN-MINREG-NEXT:    ds_read_b128 a[16:19], v4 offset:64
-; GCN-MINREG-NEXT:    ds_read_b128 a[0:3], v4
-; GCN-MINREG-NEXT:    ds_read_b128 a[4:7], v4 offset:16
-; GCN-MINREG-NEXT:    ds_read_b128 a[8:11], v4 offset:32
-; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v4 offset:48
+; GCN-MINREG-NEXT:    v_add_u32_e32 v5, s0, v0
+; GCN-MINREG-NEXT:    ds_read_b128 a[28:31], v5 offset:112
+; GCN-MINREG-NEXT:    ds_read_b128 a[24:27], v5 offset:96
+; GCN-MINREG-NEXT:    ds_read_b128 a[20:23], v5 offset:80
+; GCN-MINREG-NEXT:    ds_read_b128 a[16:19], v5 offset:64
+; GCN-MINREG-NEXT:    ds_read_b128 a[0:3], v5
+; GCN-MINREG-NEXT:    ds_read_b128 a[4:7], v5 offset:16
+; GCN-MINREG-NEXT:    ds_read_b128 a[8:11], v5 offset:32
+; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v5 offset:48
 ; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
-; GCN-MINREG-NEXT:    v_add_u32_e32 v5, s1, v0
-; GCN-MINREG-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-MINREG-NEXT:    v_add_u32_e32 v3, 0x6000, v4
+; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v40, v39, a[0:31]
+; GCN-MINREG-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-MINREG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-MINREG-NEXT:    ds_read_b128 v[6:9], v5 offset:8192
+; GCN-MINREG-NEXT:    s_mov_b32 s14, -1
+; GCN-MINREG-NEXT:    s_mov_b32 s15, 0xe00000
+; GCN-MINREG-NEXT:    s_add_u32 s12, s12, s11
+; GCN-MINREG-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-MINREG-NEXT:    ds_read_b128 v[34:37], v5 offset:8304
+; GCN-MINREG-NEXT:    ds_read_b128 v[30:33], v5 offset:8288
+; GCN-MINREG-NEXT:    ds_read_b128 v[26:29], v5 offset:8272
+; GCN-MINREG-NEXT:    ds_read_b128 v[22:25], v5 offset:8256
+; GCN-MINREG-NEXT:    ds_read_b128 v[18:21], v5 offset:8240
+; GCN-MINREG-NEXT:    ds_read_b128 v[14:17], v5 offset:8224
+; GCN-MINREG-NEXT:    ds_read_b128 v[10:13], v5 offset:8208
+; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(7)
+; GCN-MINREG-NEXT:    buffer_store_dword v6, off, s[12:15], 0 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    v_add_u32_e32 v4, 0x6000, v5
+; GCN-MINREG-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v8, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v9, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(6)
+; GCN-MINREG-NEXT:    buffer_store_dword v37, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    ds_read_b128 v[6:9], v5 offset:24576
+; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(1)
+; GCN-MINREG-NEXT:    buffer_store_dword v10, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v11, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v12, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v13, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v16, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v17, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v18, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v19, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v20, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v22, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v23, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v24, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v25, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v26, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v27, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v28, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v29, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v30, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v31, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v32, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v33, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v34, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v35, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v36, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    ds_read_b128 v[34:37], v5 offset:24688
+; GCN-MINREG-NEXT:    ds_read_b128 v[30:33], v5 offset:24672
+; GCN-MINREG-NEXT:    ds_read_b128 v[26:29], v5 offset:24656
+; GCN-MINREG-NEXT:    ds_read_b128 v[22:25], v5 offset:24640
+; GCN-MINREG-NEXT:    ds_read_b128 v[18:21], v5 offset:24624
+; GCN-MINREG-NEXT:    ds_read_b128 v[14:17], v5 offset:24608
+; GCN-MINREG-NEXT:    ds_read_b128 v[10:13], v5 offset:24592
+; GCN-MINREG-NEXT:    ds_read_b128 a[60:63], v5 offset:49264
+; GCN-MINREG-NEXT:    ds_read_b128 a[56:59], v5 offset:49248
+; GCN-MINREG-NEXT:    ds_read_b128 a[52:55], v5 offset:49232
+; GCN-MINREG-NEXT:    ds_read_b128 a[48:51], v5 offset:49216
+; GCN-MINREG-NEXT:    ds_read_b128 a[44:47], v5 offset:49200
+; GCN-MINREG-NEXT:    ds_read_b128 a[40:43], v5 offset:49184
+; GCN-MINREG-NEXT:    ds_read_b128 a[36:39], v5 offset:49168
+; GCN-MINREG-NEXT:    ds_read_b128 a[32:35], v5 offset:49152
+; GCN-MINREG-NEXT:    buffer_store_dword a0, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    v_add_u32_e32 v41, s1, v0
+; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(14)
+; GCN-MINREG-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v63, a10 ; Reload Reuse
+; GCN-MINREG-NEXT:    buffer_store_dword a1, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword a2, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword a3, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword a4, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword a5, off, s[12:15], 0 offset:148 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword a6, off, s[12:15], 0 offset:152 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword a7, off, s[12:15], 0 offset:156 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword a8, off, s[12:15], 0 offset:160 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword a9, off, s[12:15], 0 offset:164 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v62, a11 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v61, a12 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v60, a13 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v58, a15 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v57, a16 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v56, a17 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v55, a18 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v54, a19 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v53, a20 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v52, a21 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v51, a22 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v50, a23 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v49, a24 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v48, a25 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v47, a26 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v46, a27 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v45, a28 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v44, a29 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v43, a30 ; Reload Reuse
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v42, a31 ; Reload Reuse
+; GCN-MINREG-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v8, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v9, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(8)
+; GCN-MINREG-NEXT:    buffer_store_dword v10, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v11, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v12, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v13, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v16, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v17, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v18, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v19, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v20, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v22, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v23, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v24, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v25, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v26, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v27, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v28, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v29, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v30, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v31, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v37, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v40, v39, a[32:63]
+; GCN-MINREG-NEXT:    ds_read_b128 a[28:31], v4 offset:57456
+; GCN-MINREG-NEXT:    ds_read_b128 a[24:27], v4 offset:57440
+; GCN-MINREG-NEXT:    ds_read_b128 a[20:23], v4 offset:57424
+; GCN-MINREG-NEXT:    ds_read_b128 a[16:19], v4 offset:57408
+; GCN-MINREG-NEXT:    ds_read_b128 a[0:3], v4 offset:57344
+; GCN-MINREG-NEXT:    ds_read_b128 a[4:7], v4 offset:57360
+; GCN-MINREG-NEXT:    ds_read_b128 a[8:11], v4 offset:57376
+; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v4 offset:57392
+; GCN-MINREG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:144 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword v5, off, s[12:15], 0 offset:148 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword v6, off, s[12:15], 0 offset:152 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword v7, off, s[12:15], 0 offset:156 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:160 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:164 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v38, s1
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v10, v63
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v11, v62
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v12, v61
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v13, v60
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v14, v59
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v15, v58
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v16, v57
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v17, v56
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v18, v55
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v19, v54
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v20, v53
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v21, v52
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v22, v51
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v23, v50
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v24, v49
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v25, v48
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v26, v47
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v27, v46
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v28, v45
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v29, v44
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v30, v43
+; GCN-MINREG-NEXT:    s_waitcnt vmcnt(0)
+; GCN-MINREG-NEXT:    v_mov_b32_e32 v31, v42
+; GCN-MINREG-NEXT:    ds_write_b128 v41, v[28:31] offset:112
+; GCN-MINREG-NEXT:    ds_write_b128 v41, v[24:27] offset:96
+; GCN-MINREG-NEXT:    ds_write_b128 v41, v[20:23] offset:80
+; GCN-MINREG-NEXT:    ds_write_b128 v41, v[16:19] offset:64
+; GCN-MINREG-NEXT:    ds_write_b128 v41, v[12:15] offset:48
+; GCN-MINREG-NEXT:    buffer_store_dword v32, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v33, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v34, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v35, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill
+; GCN-MINREG-NEXT:    buffer_store_dword v36, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    ds_write_b128 v5, a[28:31] offset:112
-; GCN-MINREG-NEXT:    ds_write_b128 v5, a[24:27] offset:96
-; GCN-MINREG-NEXT:    ds_write_b128 v5, a[20:23] offset:80
-; GCN-MINREG-NEXT:    ds_write_b128 v5, a[16:19] offset:64
-; GCN-MINREG-NEXT:    ds_write_b128 v5, a[12:15] offset:48
-; GCN-MINREG-NEXT:    ds_write_b128 v5, a[8:11] offset:32
-; GCN-MINREG-NEXT:    ds_write_b128 v5, a[4:7] offset:16
-; GCN-MINREG-NEXT:    ds_write_b128 v5, a[0:3]
-; GCN-MINREG-NEXT:    ds_read_b128 a[28:31], v4 offset:8304
-; GCN-MINREG-NEXT:    ds_read_b128 a[24:27], v4 offset:8288
-; GCN-MINREG-NEXT:    ds_read_b128 a[20:23], v4 offset:8272
-; GCN-MINREG-NEXT:    ds_read_b128 a[16:19], v4 offset:8256
-; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v4 offset:8240
-; GCN-MINREG-NEXT:    ds_read_b128 a[8:11], v4 offset:8224
-; GCN-MINREG-NEXT:    ds_read_b128 a[4:7], v4 offset:8208
-; GCN-MINREG-NEXT:    ds_read_b128 a[0:3], v4 offset:8192
-; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 2
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[28:31] offset:8304
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[16:19] offset:8256
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[20:23] offset:8272
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[8:11] offset:8224
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[12:15] offset:8240
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[0:3] offset:8192
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[4:7] offset:8208
-; GCN-MINREG-NEXT:    ds_read_b128 a[28:31], v4 offset:24688
-; GCN-MINREG-NEXT:    ds_read_b128 a[24:27], v4 offset:24672
-; GCN-MINREG-NEXT:    ds_read_b128 a[20:23], v4 offset:24656
-; GCN-MINREG-NEXT:    ds_read_b128 a[16:19], v4 offset:24640
-; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v4 offset:24624
-; GCN-MINREG-NEXT:    ds_read_b128 a[8:11], v4 offset:24608
-; GCN-MINREG-NEXT:    ds_read_b128 a[4:7], v4 offset:24592
-; GCN-MINREG-NEXT:    ds_read_b128 a[0:3], v4 offset:24576
-; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 2
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[16:19] offset:16448
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[20:23] offset:16464
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[8:11] offset:16416
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[12:15] offset:16432
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[0:3] offset:16384
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[4:7] offset:16400
-; GCN-MINREG-NEXT:    ds_read_b128 a[28:31], v4 offset:49264
-; GCN-MINREG-NEXT:    ds_read_b128 a[24:27], v4 offset:49248
-; GCN-MINREG-NEXT:    ds_read_b128 a[20:23], v4 offset:49232
-; GCN-MINREG-NEXT:    ds_read_b128 a[16:19], v4 offset:49216
-; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v4 offset:49200
-; GCN-MINREG-NEXT:    ds_read_b128 a[8:11], v4 offset:49184
-; GCN-MINREG-NEXT:    ds_read_b128 a[4:7], v4 offset:49168
-; GCN-MINREG-NEXT:    ds_read_b128 a[0:3], v4 offset:49152
-; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-MINREG-NEXT:    ds_write_b128 v41, v[0:3]
+; GCN-MINREG-NEXT:    ds_write_b128 v41, v[4:7] offset:16
+; GCN-MINREG-NEXT:    ds_write_b128 v41, v[8:11] offset:32
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[56:59] offset:24672
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[60:63] offset:24688
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[48:51] offset:24640
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[52:55] offset:24656
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[40:43] offset:24608
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[44:47] offset:24624
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[32:35] offset:24576
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[36:39] offset:24592
+; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(14)
+; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v40, v39, a[0:31]
+; GCN-MINREG-NEXT:    buffer_load_dword a32, off, s[12:15], 0 offset:256 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a33, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a34, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a35, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a36, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a37, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a38, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a39, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a40, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a41, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a42, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a43, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a44, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a45, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a46, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a47, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a48, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a49, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a50, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[24:27] offset:32864
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[28:31] offset:32880
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[16:19] offset:32832
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[20:23] offset:32848
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[8:11] offset:32800
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[12:15] offset:32816
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[0:3] offset:32768
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[4:7] offset:32784
+; GCN-MINREG-NEXT:    buffer_load_dword a0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a4, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a5, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a6, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a7, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a8, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a9, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a10, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a11, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a12, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a13, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a14, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a15, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a16, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a17, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a18, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a19, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a20, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a21, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a22, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a23, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a24, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a25, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a26, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a27, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a28, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a29, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a30, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a31, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a51, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a52, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a53, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a54, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a55, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a56, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a57, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a58, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a59, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a60, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a61, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a62, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    buffer_load_dword a63, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload
+; GCN-MINREG-NEXT:    s_waitcnt vmcnt(13)
+; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v40, v39, a[0:31]
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 2
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[24:27] offset:24672
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[28:31] offset:24688
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[16:19] offset:24640
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[20:23] offset:24656
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[8:11] offset:24608
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[12:15] offset:24624
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[0:3] offset:24576
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[4:7] offset:24592
-; GCN-MINREG-NEXT:    ds_read_b128 a[28:31], v3 offset:57456
-; GCN-MINREG-NEXT:    ds_read_b128 a[24:27], v3 offset:57440
-; GCN-MINREG-NEXT:    ds_read_b128 a[20:23], v3 offset:57424
-; GCN-MINREG-NEXT:    ds_read_b128 a[16:19], v3 offset:57408
-; GCN-MINREG-NEXT:    ds_read_b128 a[0:3], v3 offset:57344
-; GCN-MINREG-NEXT:    ds_read_b128 a[4:7], v3 offset:57360
-; GCN-MINREG-NEXT:    ds_read_b128 a[8:11], v3 offset:57376
-; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v3 offset:57392
-; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    s_nop 7
 ; GCN-MINREG-NEXT:    s_nop 7
 ; GCN-MINREG-NEXT:    s_nop 2
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[16:19] offset:32832
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[20:23] offset:32848
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[8:11] offset:32800
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[12:15] offset:32816
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[0:3] offset:32768
-; GCN-MINREG-NEXT:    ds_write_b128 v0, a[4:7] offset:32784
-; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v33, a31
+; GCN-MINREG-NEXT:    s_waitcnt vmcnt(0)
+; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v40, v39, a[32:63]
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v32, a30
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v31, a29
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v30, a28
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v29, a27
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v28, a26
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v27, a25
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v26, a24
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v25, a23
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v24, a22
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v23, a21
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v22, a20
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v21, a19
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v20, a18
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v19, a17
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v18, a16
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v17, a15
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v16, a14
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v15, a13
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v14, a12
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v13, a11
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v12, a10
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v11, a9
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v10, a8
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v9, a7
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v8, a6
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v7, a5
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v6, a4
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v5, a3
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v4, a2
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v3, a1
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v2, a0
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[56:59] offset:16480
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[60:63] offset:16496
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[48:51] offset:16448
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[52:55] offset:16464
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[40:43] offset:16416
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[44:47] offset:16432
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[32:35] offset:16384
+; GCN-MINREG-NEXT:    ds_write_b128 v38, a[36:39] offset:16400
+; GCN-MINREG-NEXT:    ds_write_b128 v38, v[26:29] offset:8288
+; GCN-MINREG-NEXT:    ds_write_b128 v38, v[30:33] offset:8304
+; GCN-MINREG-NEXT:    ds_write_b128 v38, v[18:21] offset:8256
+; GCN-MINREG-NEXT:    ds_write_b128 v38, v[22:25] offset:8272
+; GCN-MINREG-NEXT:    ds_write_b128 v38, v[10:13] offset:8224
+; GCN-MINREG-NEXT:    ds_write_b128 v38, v[14:17] offset:8240
+; GCN-MINREG-NEXT:    ds_write_b128 v38, v[2:5] offset:8192
+; GCN-MINREG-NEXT:    ds_write_b128 v38, v[6:9] offset:8208
 ; GCN-MINREG-NEXT:    s_endpgm
 ;
 ; GCN-MAXOCC-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave:
 ; GCN-MAXOCC:       ; %bb.0: ; %entry
 ; GCN-MAXOCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GCN-MAXOCC-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GCN-MAXOCC-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v2, 1.0
-; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v3, 2.0
+; GCN-MAXOCC-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GCN-MAXOCC-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-MAXOCC-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT:    v_add_u32_e32 v0, s0, v1
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v0 offset:112
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v0 offset:96
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v0 offset:80
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v0 offset:64
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v0
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v0 offset:16
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v0 offset:32
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v0 offset:48
+; GCN-MAXOCC-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[2:5], v1
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[30:33], v1 offset:112
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[26:29], v1 offset:96
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[22:25], v1 offset:80
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[18:21], v1 offset:64
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[6:9], v1 offset:16
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[10:13], v1 offset:32
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[14:17], v1 offset:48
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a0, v2
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a1, v3
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a2, v4
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a3, v5
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a4, v6
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a5, v7
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a6, v8
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a7, v9
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a8, v10
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a9, v11
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a10, v12
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a11, v13
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a12, v14
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a13, v15
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a14, v16
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a15, v17
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a16, v18
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a17, v19
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a18, v20
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a19, v21
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a20, v22
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a21, v23
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a22, v24
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a23, v25
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a24, v26
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a25, v27
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a26, v28
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a27, v29
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a28, v30
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a29, v31
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a30, v32
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a31, v33
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v2, 1.0
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v3, 2.0
+; GCN-MAXOCC-NEXT:    s_mov_b32 s14, -1
+; GCN-MAXOCC-NEXT:    s_mov_b32 s15, 0xe00000
 ; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-MAXOCC-NEXT:    v_add_u32_e32 v1, s1, v1
+; GCN-MAXOCC-NEXT:    s_add_u32 s12, s12, s11
+; GCN-MAXOCC-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v4, v3
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v3, v2
+; GCN-MAXOCC-NEXT:    v_add_u32_e32 v2, 0x6000, v1
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 1
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[28:31] offset:112
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[24:27] offset:96
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[20:23] offset:80
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[16:19] offset:64
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[12:15] offset:48
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[8:11] offset:32
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[4:7] offset:16
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[0:3]
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v0 offset:8304
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v0 offset:8288
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v0 offset:8272
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v0 offset:8256
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v0 offset:8240
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v0 offset:8224
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v0 offset:8208
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v0 offset:8192
+; GCN-MAXOCC-NEXT:    s_nop 5
+; GCN-MAXOCC-NEXT:    buffer_store_dword a0, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v63, a4 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    buffer_store_dword a1, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a2, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a3, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v62, a5 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v61, a6 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v60, a7 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v59, a8 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v58, a9 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v57, a10 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v55, a12 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v54, a13 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v53, a14 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v52, a15 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v51, a16 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v50, a17 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v49, a18 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v48, a19 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v47, a20 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v46, a21 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v45, a22 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v44, a23 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v43, a24 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v42, a25 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v41, a26 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v40, a27 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v39, a28 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v38, a29 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v37, a30 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v36, a31 ; Reload Reuse
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v1 offset:8304
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v1 offset:8288
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v1 offset:8272
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v1 offset:8256
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v1 offset:8240
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v1 offset:8224
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v1 offset:8208
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v1 offset:8192
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
-; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v4, a[0:31]
 ; GCN-MAXOCC-NEXT:    s_nop 7
 ; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 1
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[24:27] offset:8288
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[28:31] offset:8304
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[16:19] offset:8256
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[20:23] offset:8272
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[8:11] offset:8224
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[12:15] offset:8240
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[0:3] offset:8192
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[4:7] offset:8208
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v0 offset:24688
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v0 offset:24672
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v0 offset:24656
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v0 offset:24640
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v0 offset:24624
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v0 offset:24608
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v0 offset:24592
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v0 offset:24576
+; GCN-MAXOCC-NEXT:    s_nop 2
+; GCN-MAXOCC-NEXT:    buffer_store_dword a0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    s_nop 0
+; GCN-MAXOCC-NEXT:    buffer_store_dword a1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a8, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a9, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a10, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a11, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a12, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a13, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a14, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a15, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a16, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a17, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a18, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a19, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a20, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a21, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a22, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a23, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a24, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a25, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a26, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a27, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a28, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a29, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a30, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword a31, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v1 offset:24688
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v1 offset:24672
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v1 offset:24656
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v1 offset:24640
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v1 offset:24624
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v1 offset:24608
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v1 offset:24592
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v1 offset:24576
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[60:63], v1 offset:49264
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[56:59], v1 offset:49248
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[52:55], v1 offset:49232
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[48:51], v1 offset:49216
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[44:47], v1 offset:49200
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[40:43], v1 offset:49184
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[36:39], v1 offset:49168
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[32:35], v1 offset:49152
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[4:7], v2 offset:57344
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[32:35], v2 offset:57456
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[28:31], v2 offset:57440
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[24:27], v2 offset:57424
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[20:23], v2 offset:57408
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[8:11], v2 offset:57360
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[12:15], v2 offset:57376
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[16:19], v2 offset:57392
+; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(7)
+; GCN-MAXOCC-NEXT:    buffer_store_dword v4, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    s_nop 0
+; GCN-MAXOCC-NEXT:    buffer_store_dword v5, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(2)
+; GCN-MAXOCC-NEXT:    buffer_store_dword v8, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v9, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v10, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v11, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(1)
+; GCN-MAXOCC-NEXT:    buffer_store_dword v12, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v13, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-MAXOCC-NEXT:    buffer_store_dword v16, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v17, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v18, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v19, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v20, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v22, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v23, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v24, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v25, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v26, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v27, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v28, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v29, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v30, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v31, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v32, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    buffer_store_dword v35, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    v_add_u32_e32 v32, s1, v0
+; GCN-MAXOCC-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v4, v63
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v5, v62
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v6, v61
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v7, v60
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v8, v59
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v9, v58
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v10, v57
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v11, v56
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v12, v55
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v13, v54
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v14, v53
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v15, v52
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v16, v51
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v17, v50
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v18, v49
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v19, v48
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v20, v47
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v21, v46
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v22, v45
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v23, v44
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v24, v43
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v25, v42
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v26, v41
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v27, v40
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v28, v39
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v29, v38
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v30, v37
+; GCN-MAXOCC-NEXT:    s_waitcnt vmcnt(0)
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v31, v36
+; GCN-MAXOCC-NEXT:    buffer_store_dword v33, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    ds_write_b128 v32, v[28:31] offset:112
+; GCN-MAXOCC-NEXT:    ds_write_b128 v32, v[24:27] offset:96
+; GCN-MAXOCC-NEXT:    ds_write_b128 v32, v[20:23] offset:80
+; GCN-MAXOCC-NEXT:    ds_write_b128 v32, v[16:19] offset:64
+; GCN-MAXOCC-NEXT:    ds_write_b128 v32, v[12:15] offset:48
+; GCN-MAXOCC-NEXT:    ds_write_b128 v32, v[8:11] offset:32
+; GCN-MAXOCC-NEXT:    ds_write_b128 v32, v[4:7] offset:16
+; GCN-MAXOCC-NEXT:    buffer_store_dword v34, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill
+; GCN-MAXOCC-NEXT:    ds_write_b128 v32, v[0:3]
+; GCN-MAXOCC-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-MAXOCC-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v1, 1.0
+; GCN-MAXOCC-NEXT:    s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, v[26:29] offset:8288
+; GCN-MAXOCC-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, v[30:33] offset:8304
+; GCN-MAXOCC-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, v[18:21] offset:8256
+; GCN-MAXOCC-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, v[22:25] offset:8272
+; GCN-MAXOCC-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, v[10:13] offset:8224
+; GCN-MAXOCC-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    s_waitcnt vmcnt(27)
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, v[14:17] offset:8240
+; GCN-MAXOCC-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    s_waitcnt vmcnt(28)
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, v[2:5] offset:8192
+; GCN-MAXOCC-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    s_waitcnt vmcnt(0)
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v2, 2.0
+; GCN-MAXOCC-NEXT:    buffer_load_dword v6, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v7, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-MAXOCC-NEXT:    buffer_load_dword v8, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v5, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v10, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v11, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v18, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v19, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v20, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v21, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v22, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v23, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[16:19] offset:16448
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[20:23] offset:16464
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[8:11] offset:16416
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[12:15] offset:16432
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[0:3] offset:16384
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[4:7] offset:16400
+; GCN-MAXOCC-NEXT:    buffer_load_dword a0, off, s[12:15], 0 offset:256 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a1, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a2, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a3, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a4, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a5, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a6, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a7, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a8, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a9, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a10, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a11, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a12, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a13, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a14, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a15, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a16, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a17, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a18, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a19, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a20, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a21, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a22, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a23, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a24, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a25, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a26, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a27, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a28, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a29, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a30, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword a31, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v24, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v25, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v26, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v27, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v28, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v29, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v30, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v31, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v32, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    buffer_load_dword v33, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-MAXOCC-NEXT:    s_waitcnt vmcnt(10)
+; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    s_nop 2
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[16:19] offset:32832
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[20:23] offset:32848
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[8:11] offset:32800
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[12:15] offset:32816
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[0:3] offset:32768
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[4:7] offset:32784
+; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-MAXOCC-NEXT:    s_nop 7
 ; GCN-MAXOCC-NEXT:    s_nop 7
 ; GCN-MAXOCC-NEXT:    s_nop 2
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[24:27] offset:16480
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[28:31] offset:16496
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[16:19] offset:16448
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[20:23] offset:16464
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[8:11] offset:16416
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[12:15] offset:16432
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[0:3] offset:16384
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[4:7] offset:16400
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v0 offset:49264
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v0 offset:49248
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v0 offset:49232
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v0 offset:49216
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v0 offset:49200
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v0 offset:49184
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v0 offset:49168
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v0 offset:49152
-; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-MAXOCC-NEXT:    v_add_u32_e32 v0, 0x6000, v0
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[56:59] offset:24672
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[60:63] offset:24688
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[48:51] offset:24640
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[52:55] offset:24656
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[40:43] offset:24608
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[44:47] offset:24624
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[32:35] offset:24576
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, a[36:39] offset:24592
+; GCN-MAXOCC-NEXT:    ds_write_b128 v0, v[6:9] offset:8208
+; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 1
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[24:27] offset:24672
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[28:31] offset:24688
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[16:19] offset:24640
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[20:23] offset:24656
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[8:11] offset:24608
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[12:15] offset:24624
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[0:3] offset:24576
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[4:7] offset:24592
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v0 offset:57456
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v0 offset:57440
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v0 offset:57424
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v0 offset:57408
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v0 offset:57344
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v0 offset:57360
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v0 offset:57376
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v0 offset:57392
-; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 2
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[24:27] offset:32864
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[28:31] offset:32880
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[16:19] offset:32832
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[20:23] offset:32848
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[8:11] offset:32800
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[12:15] offset:32816
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[0:3] offset:32768
-; GCN-MAXOCC-NEXT:    ds_write_b128 v1, a[4:7] offset:32784
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    s_endpgm
 ;
@@ -275,11 +967,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-ILP:       ; %bb.0: ; %entry
 ; GCN-ILP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GCN-ILP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GCN-ILP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; GCN-ILP-NEXT:    v_mov_b32_e32 v1, 1.0
-; GCN-ILP-NEXT:    v_mov_b32_e32 v2, 2.0
+; GCN-ILP-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
+; GCN-ILP-NEXT:    v_mov_b32_e32 v0, 1.0
+; GCN-ILP-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT:    v_add_u32_e32 v3, s0, v0
+; GCN-ILP-NEXT:    v_add_u32_e32 v3, s0, v2
 ; GCN-ILP-NEXT:    ds_read_b128 a[12:15], v3 offset:48
 ; GCN-ILP-NEXT:    ds_read_b128 a[8:11], v3 offset:32
 ; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:16
@@ -289,119 +981,355 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-ILP-NEXT:    ds_read_b128 a[24:27], v3 offset:96
 ; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:112
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-ILP-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-ILP-NEXT:    ds_read_b128 v[4:7], v3 offset:8192
+; GCN-ILP-NEXT:    s_mov_b32 s14, -1
+; GCN-ILP-NEXT:    s_mov_b32 s15, 0xe00000
+; GCN-ILP-NEXT:    s_add_u32 s12, s12, s11
+; GCN-ILP-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-ILP-NEXT:    ds_read_b128 v[32:35], v3 offset:8304
+; GCN-ILP-NEXT:    ds_read_b128 v[28:31], v3 offset:8288
+; GCN-ILP-NEXT:    ds_read_b128 v[24:27], v3 offset:8272
+; GCN-ILP-NEXT:    ds_read_b128 v[20:23], v3 offset:8256
+; GCN-ILP-NEXT:    ds_read_b128 v[16:19], v3 offset:8240
+; GCN-ILP-NEXT:    ds_read_b128 v[12:15], v3 offset:8224
+; GCN-ILP-NEXT:    ds_read_b128 v[8:11], v3 offset:8208
+; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(7)
+; GCN-ILP-NEXT:    buffer_store_dword v4, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 1
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[28:31] offset:112
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[24:27] offset:96
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[20:23] offset:80
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[16:19] offset:64
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[12:15] offset:48
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[8:11] offset:32
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[4:7] offset:16
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[0:3]
-; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3 offset:8192
-; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:8208
-; GCN-ILP-NEXT:    ds_read_b128 a[8:11], v3 offset:8224
-; GCN-ILP-NEXT:    ds_read_b128 a[12:15], v3 offset:8240
-; GCN-ILP-NEXT:    ds_read_b128 a[16:19], v3 offset:8256
-; GCN-ILP-NEXT:    ds_read_b128 a[20:23], v3 offset:8272
-; GCN-ILP-NEXT:    ds_read_b128 a[24:27], v3 offset:8288
-; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:8304
-; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 1
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[28:31] offset:8304
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[16:19] offset:8256
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[20:23] offset:8272
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[8:11] offset:8224
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[12:15] offset:8240
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[0:3] offset:8192
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[4:7] offset:8208
-; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3 offset:24576
-; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:24592
-; GCN-ILP-NEXT:    ds_read_b128 a[8:11], v3 offset:24608
-; GCN-ILP-NEXT:    ds_read_b128 a[12:15], v3 offset:24624
-; GCN-ILP-NEXT:    ds_read_b128 a[16:19], v3 offset:24640
-; GCN-ILP-NEXT:    ds_read_b128 a[20:23], v3 offset:24656
-; GCN-ILP-NEXT:    ds_read_b128 a[24:27], v3 offset:24672
-; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:24688
-; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
 ; GCN-ILP-NEXT:    s_nop 2
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[4:7] offset:16400
-; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:49168
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[0:3] offset:16384
-; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3 offset:49152
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[12:15] offset:16432
-; GCN-ILP-NEXT:    ds_read_b128 a[12:15], v3 offset:49200
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[8:11] offset:16416
-; GCN-ILP-NEXT:    ds_read_b128 a[8:11], v3 offset:49184
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[20:23] offset:16464
-; GCN-ILP-NEXT:    ds_read_b128 a[20:23], v3 offset:49232
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[16:19] offset:16448
-; GCN-ILP-NEXT:    ds_read_b128 a[16:19], v3 offset:49216
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
-; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:49264
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
-; GCN-ILP-NEXT:    ds_read_b128 a[24:27], v3 offset:49248
+; GCN-ILP-NEXT:    buffer_store_dword a0, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v63, a4 ; Reload Reuse
+; GCN-ILP-NEXT:    buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-ILP-NEXT:    buffer_store_dword v8, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v9, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v10, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v11, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v12, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v13, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v16, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v17, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v18, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v19, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v20, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v22, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v23, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v24, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v25, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v26, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v27, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v28, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v29, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v30, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v31, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v32, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v33, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v34, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v35, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    ds_read_b128 v[32:35], v3 offset:24688
+; GCN-ILP-NEXT:    ds_read_b128 v[28:31], v3 offset:24672
+; GCN-ILP-NEXT:    ds_read_b128 v[24:27], v3 offset:24656
+; GCN-ILP-NEXT:    ds_read_b128 v[20:23], v3 offset:24640
+; GCN-ILP-NEXT:    ds_read_b128 v[16:19], v3 offset:24624
+; GCN-ILP-NEXT:    ds_read_b128 v[12:15], v3 offset:24608
+; GCN-ILP-NEXT:    ds_read_b128 v[8:11], v3 offset:24592
+; GCN-ILP-NEXT:    ds_read_b128 v[4:7], v3 offset:24576
+; GCN-ILP-NEXT:    ds_read_b128 a[60:63], v3 offset:49264
+; GCN-ILP-NEXT:    ds_read_b128 a[56:59], v3 offset:49248
+; GCN-ILP-NEXT:    ds_read_b128 a[52:55], v3 offset:49232
+; GCN-ILP-NEXT:    ds_read_b128 a[48:51], v3 offset:49216
+; GCN-ILP-NEXT:    ds_read_b128 a[44:47], v3 offset:49200
+; GCN-ILP-NEXT:    ds_read_b128 a[40:43], v3 offset:49184
+; GCN-ILP-NEXT:    ds_read_b128 a[36:39], v3 offset:49168
+; GCN-ILP-NEXT:    ds_read_b128 a[32:35], v3 offset:49152
 ; GCN-ILP-NEXT:    v_add_u32_e32 v3, 0x6000, v3
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 1
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[4:7] offset:24592
-; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:57360
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[0:3] offset:24576
-; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3 offset:57344
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[12:15] offset:24624
+; GCN-ILP-NEXT:    buffer_store_dword a1, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword a2, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword a3, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v62, a5 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v61, a6 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v60, a7 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v59, a8 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v58, a9 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v57, a10 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v55, a12 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v54, a13 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v53, a14 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v52, a15 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v51, a16 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v50, a17 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v49, a18 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v48, a19 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v47, a20 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v46, a21 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v45, a22 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v44, a23 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v43, a24 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v42, a25 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v41, a26 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v40, a27 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v39, a28 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v38, a29 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v37, a30 ; Reload Reuse
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v36, a31 ; Reload Reuse
 ; GCN-ILP-NEXT:    ds_read_b128 a[12:15], v3 offset:57392
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[8:11] offset:24608
 ; GCN-ILP-NEXT:    ds_read_b128 a[8:11], v3 offset:57376
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[20:23] offset:24656
-; GCN-ILP-NEXT:    ds_read_b128 a[20:23], v3 offset:57424
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[16:19] offset:24640
+; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:57360
+; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3 offset:57344
 ; GCN-ILP-NEXT:    ds_read_b128 a[16:19], v3 offset:57408
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[28:31] offset:24688
-; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:57456
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[24:27] offset:24672
+; GCN-ILP-NEXT:    ds_read_b128 a[20:23], v3 offset:57424
 ; GCN-ILP-NEXT:    ds_read_b128 a[24:27], v3 offset:57440
+; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:57456
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT:    v_add_u32_e32 v0, s1, v2
+; GCN-ILP-NEXT:    buffer_store_dword v4, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    s_nop 0
+; GCN-ILP-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-ILP-NEXT:    buffer_store_dword v5, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v8, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v9, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v10, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v11, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v12, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v13, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v16, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v17, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v18, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v19, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v20, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v22, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v23, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v24, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v25, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v26, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v27, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v28, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v29, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v30, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v31, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v35, off, s[12:15], 0 offset:388 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    v_mov_b32_e32 v4, v63
+; GCN-ILP-NEXT:    buffer_store_dword v32, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:144 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    v_mov_b32_e32 v5, v62
+; GCN-ILP-NEXT:    buffer_load_dword v32, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    v_mov_b32_e32 v6, v61
+; GCN-ILP-NEXT:    v_mov_b32_e32 v7, v60
+; GCN-ILP-NEXT:    v_mov_b32_e32 v8, v59
+; GCN-ILP-NEXT:    v_mov_b32_e32 v9, v58
+; GCN-ILP-NEXT:    v_mov_b32_e32 v10, v57
+; GCN-ILP-NEXT:    v_mov_b32_e32 v11, v56
+; GCN-ILP-NEXT:    v_mov_b32_e32 v12, v55
+; GCN-ILP-NEXT:    v_mov_b32_e32 v13, v54
+; GCN-ILP-NEXT:    v_mov_b32_e32 v14, v53
+; GCN-ILP-NEXT:    v_mov_b32_e32 v15, v52
+; GCN-ILP-NEXT:    v_mov_b32_e32 v16, v51
+; GCN-ILP-NEXT:    v_mov_b32_e32 v17, v50
+; GCN-ILP-NEXT:    v_mov_b32_e32 v18, v49
+; GCN-ILP-NEXT:    v_mov_b32_e32 v19, v48
+; GCN-ILP-NEXT:    v_mov_b32_e32 v20, v47
+; GCN-ILP-NEXT:    v_mov_b32_e32 v21, v46
+; GCN-ILP-NEXT:    v_mov_b32_e32 v22, v45
+; GCN-ILP-NEXT:    v_mov_b32_e32 v23, v44
+; GCN-ILP-NEXT:    v_mov_b32_e32 v24, v43
+; GCN-ILP-NEXT:    v_mov_b32_e32 v25, v42
+; GCN-ILP-NEXT:    v_mov_b32_e32 v26, v41
+; GCN-ILP-NEXT:    v_mov_b32_e32 v27, v40
+; GCN-ILP-NEXT:    v_mov_b32_e32 v28, v39
+; GCN-ILP-NEXT:    v_mov_b32_e32 v29, v38
+; GCN-ILP-NEXT:    v_mov_b32_e32 v30, v37
+; GCN-ILP-NEXT:    s_waitcnt vmcnt(1)
+; GCN-ILP-NEXT:    v_mov_b32_e32 v31, v36
+; GCN-ILP-NEXT:    buffer_store_dword v33, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    buffer_store_dword v34, off, s[12:15], 0 offset:384 ; 4-byte Folded Spill
+; GCN-ILP-NEXT:    s_waitcnt vmcnt(2)
+; GCN-ILP-NEXT:    ds_write_b128 v32, v[0:3]
+; GCN-ILP-NEXT:    buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    v_mov_b32_e32 v1, 2.0
+; GCN-ILP-NEXT:    s_waitcnt vmcnt(1)
+; GCN-ILP-NEXT:    ds_write_b128 v0, v[4:7] offset:16
+; GCN-ILP-NEXT:    ds_write_b128 v0, v[8:11] offset:32
+; GCN-ILP-NEXT:    ds_write_b128 v0, v[12:15] offset:48
+; GCN-ILP-NEXT:    ds_write_b128 v0, v[16:19] offset:64
+; GCN-ILP-NEXT:    ds_write_b128 v0, v[20:23] offset:80
+; GCN-ILP-NEXT:    ds_write_b128 v0, v[24:27] offset:96
+; GCN-ILP-NEXT:    ds_write_b128 v0, v[28:31] offset:112
+; GCN-ILP-NEXT:    v_mov_b32_e32 v0, 1.0
+; GCN-ILP-NEXT:    s_nop 1
+; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
+; GCN-ILP-NEXT:    s_waitcnt vmcnt(0)
+; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    s_nop 1
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[56:59] offset:24672
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[60:63] offset:24688
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[48:51] offset:24640
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[52:55] offset:24656
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[40:43] offset:24608
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[44:47] offset:24624
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[32:35] offset:24576
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[36:39] offset:24592
+; GCN-ILP-NEXT:    buffer_load_dword a32, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a33, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a34, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a35, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a36, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a37, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a38, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a39, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a40, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a41, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a42, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a43, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a44, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a45, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a46, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a47, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a48, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a49, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a50, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a51, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a52, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a53, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a54, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a55, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a56, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a57, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a58, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a59, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a60, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a61, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a62, off, s[12:15], 0 offset:384 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a63, off, s[12:15], 0 offset:388 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    s_waitcnt vmcnt(0)
+; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[24:27] offset:32864
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[28:31] offset:32880
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[16:19] offset:32832
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[20:23] offset:32848
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[8:11] offset:32800
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[12:15] offset:32816
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[0:3] offset:32768
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[4:7] offset:32784
+; GCN-ILP-NEXT:    buffer_load_dword a0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a4, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a5, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a6, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a7, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a8, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a9, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a10, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a11, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a12, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a13, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a14, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a15, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a16, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a17, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a18, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a19, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a20, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a21, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a22, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a23, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a24, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a25, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a26, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a27, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a28, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a29, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a30, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    buffer_load_dword a31, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload
+; GCN-ILP-NEXT:    s_waitcnt vmcnt(0)
+; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    s_nop 7
 ; GCN-ILP-NEXT:    s_nop 7
 ; GCN-ILP-NEXT:    s_nop 2
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[16:19] offset:32832
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[20:23] offset:32848
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[8:11] offset:32800
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[12:15] offset:32816
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[0:3] offset:32768
-; GCN-ILP-NEXT:    ds_write_b128 v0, a[4:7] offset:32784
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v35, a31
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v34, a30
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v33, a29
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v32, a28
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v31, a27
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v30, a26
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v29, a25
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v28, a24
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v27, a23
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v26, a22
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v25, a21
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v24, a20
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v23, a19
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v22, a18
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v21, a17
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v20, a16
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v19, a15
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v18, a14
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v17, a13
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v16, a12
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v15, a11
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v14, a10
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v13, a9
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v12, a8
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v11, a7
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v10, a6
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v9, a5
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v8, a4
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v7, a3
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v6, a2
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v5, a1
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v4, a0
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[28:31] offset:8288
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[32:35] offset:8304
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[20:23] offset:8256
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[24:27] offset:8272
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[12:15] offset:8224
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[16:19] offset:8240
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[4:7] offset:8192
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[8:11] offset:8208
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[56:59] offset:16480
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[60:63] offset:16496
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[48:51] offset:16448
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[52:55] offset:16464
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[40:43] offset:16416
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[44:47] offset:16432
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[32:35] offset:16384
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[36:39] offset:16400
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    s_endpgm
 entry:
@@ -485,12 +1413,20 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v3 offset:48
 ; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GCN-MINREG-NEXT:    ds_read_b128 a[60:63], v3 offset:8304
+; GCN-MINREG-NEXT:    ds_read_b128 a[56:59], v3 offset:8288
+; GCN-MINREG-NEXT:    ds_read_b128 a[52:55], v3 offset:8272
+; GCN-MINREG-NEXT:    ds_read_b128 a[48:51], v3 offset:8256
+; GCN-MINREG-NEXT:    ds_read_b128 a[44:47], v3 offset:8240
+; GCN-MINREG-NEXT:    ds_read_b128 a[40:43], v3 offset:8224
+; GCN-MINREG-NEXT:    ds_read_b128 a[36:39], v3 offset:8208
+; GCN-MINREG-NEXT:    ds_read_b128 a[32:35], v3 offset:8192
 ; GCN-MINREG-NEXT:    v_add_u32_e32 v2, s1, v2
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v1, v0, a[32:63]
 ; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 1
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[28:31] offset:112
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[24:27] offset:96
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[20:23] offset:80
@@ -499,31 +1435,19 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[8:11] offset:32
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[4:7] offset:16
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[0:3]
-; GCN-MINREG-NEXT:    ds_read_b128 a[28:31], v3 offset:8304
-; GCN-MINREG-NEXT:    ds_read_b128 a[24:27], v3 offset:8288
-; GCN-MINREG-NEXT:    ds_read_b128 a[20:23], v3 offset:8272
-; GCN-MINREG-NEXT:    ds_read_b128 a[16:19], v3 offset:8256
-; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v3 offset:8240
-; GCN-MINREG-NEXT:    ds_read_b128 a[8:11], v3 offset:8224
-; GCN-MINREG-NEXT:    ds_read_b128 a[4:7], v3 offset:8208
-; GCN-MINREG-NEXT:    ds_read_b128 a[0:3], v3 offset:8192
-; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
 ; GCN-MINREG-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-MINREG-NEXT:    s_nop 1
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[56:59] offset:8288
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[60:63] offset:8304
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[48:51] offset:8256
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[52:55] offset:8272
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[40:43] offset:8224
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[44:47] offset:8240
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[32:35] offset:8192
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[36:39] offset:8208
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 1
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[24:27] offset:8288
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[28:31] offset:8304
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[16:19] offset:8256
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[20:23] offset:8272
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[8:11] offset:8224
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[12:15] offset:8240
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[0:3] offset:8192
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[4:7] offset:8208
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_barrier mask(0x00000000)
 ; GCN-MINREG-NEXT:    ds_read_b128 a[28:31], v3 offset:24688
@@ -536,44 +1460,54 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v3 offset:24624
 ; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GCN-MINREG-NEXT:    ds_read_b128 a[60:63], v3 offset:49264
+; GCN-MINREG-NEXT:    ds_read_b128 a[56:59], v3 offset:49248
+; GCN-MINREG-NEXT:    ds_read_b128 a[52:55], v3 offset:49232
+; GCN-MINREG-NEXT:    ds_read_b128 a[48:51], v3 offset:49216
+; GCN-MINREG-NEXT:    ds_read_b128 a[44:47], v3 offset:49200
+; GCN-MINREG-NEXT:    ds_read_b128 a[40:43], v3 offset:49184
+; GCN-MINREG-NEXT:    ds_read_b128 a[36:39], v3 offset:49168
+; GCN-MINREG-NEXT:    ds_read_b128 a[32:35], v3 offset:49152
 ; GCN-MINREG-NEXT:    v_add_u32_e32 v4, 0x6000, v3
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 1
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[28:31] offset:16496
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[24:27] offset:16480
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[20:23] offset:16464
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[16:19] offset:16448
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[12:15] offset:16432
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[8:11] offset:16416
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[4:7] offset:16400
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[0:3] offset:16384
-; GCN-MINREG-NEXT:    ds_read_b128 a[28:31], v3 offset:49264
-; GCN-MINREG-NEXT:    ds_read_b128 a[24:27], v3 offset:49248
-; GCN-MINREG-NEXT:    ds_read_b128 a[20:23], v3 offset:49232
-; GCN-MINREG-NEXT:    ds_read_b128 a[16:19], v3 offset:49216
-; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v3 offset:49200
-; GCN-MINREG-NEXT:    ds_read_b128 a[8:11], v3 offset:49184
-; GCN-MINREG-NEXT:    ds_read_b128 a[4:7], v3 offset:49168
-; GCN-MINREG-NEXT:    ds_read_b128 a[0:3], v3 offset:49152
-; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v1, v0, a[32:63]
 ; GCN-MINREG-NEXT:    s_nop 7
-; GCN-MINREG-NEXT:    s_nop 2
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[28:31] offset:24688
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[24:27] offset:24672
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[20:23] offset:24656
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[16:19] offset:24640
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[12:15] offset:24624
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[8:11] offset:24608
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[4:7] offset:24592
-; GCN-MINREG-NEXT:    ds_write_b128 v2, a[0:3] offset:24576
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v37, a31
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v36, a30
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v35, a29
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v34, a28
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v33, a27
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v32, a26
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v31, a25
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v30, a24
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v29, a23
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v28, a22
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v27, a21
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v26, a20
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v25, a19
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v24, a18
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v23, a17
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v22, a16
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v21, a15
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v20, a14
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v19, a13
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v18, a12
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v17, a11
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v16, a10
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v15, a9
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v14, a8
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v13, a7
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v12, a6
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v11, a5
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v10, a4
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v9, a3
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v8, a2
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v7, a1
+; GCN-MINREG-NEXT:    v_accvgpr_read_b32 v6, a0
 ; GCN-MINREG-NEXT:    ds_read_b128 a[28:31], v4 offset:57456
 ; GCN-MINREG-NEXT:    ds_read_b128 a[24:27], v4 offset:57440
 ; GCN-MINREG-NEXT:    ds_read_b128 a[20:23], v4 offset:57424
@@ -582,12 +1516,28 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MINREG-NEXT:    ds_read_b128 a[4:7], v4 offset:57360
 ; GCN-MINREG-NEXT:    ds_read_b128 a[8:11], v4 offset:57376
 ; GCN-MINREG-NEXT:    ds_read_b128 a[12:15], v4 offset:57392
-; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[60:63] offset:24688
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[56:59] offset:24672
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[52:55] offset:24656
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[48:51] offset:24640
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[44:47] offset:24624
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[40:43] offset:24608
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[36:39] offset:24592
+; GCN-MINREG-NEXT:    ds_write_b128 v2, a[32:35] offset:24576
+; GCN-MINREG-NEXT:    s_waitcnt lgkmcnt(8)
 ; GCN-MINREG-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MINREG-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MINREG-NEXT:    s_nop 7
+; GCN-MINREG-NEXT:    ds_write_b128 v2, v[34:37] offset:16496
+; GCN-MINREG-NEXT:    ds_write_b128 v2, v[30:33] offset:16480
+; GCN-MINREG-NEXT:    ds_write_b128 v2, v[26:29] offset:16464
+; GCN-MINREG-NEXT:    ds_write_b128 v2, v[22:25] offset:16448
+; GCN-MINREG-NEXT:    ds_write_b128 v2, v[18:21] offset:16432
+; GCN-MINREG-NEXT:    ds_write_b128 v2, v[14:17] offset:16416
+; GCN-MINREG-NEXT:    ds_write_b128 v2, v[10:13] offset:16400
+; GCN-MINREG-NEXT:    ds_write_b128 v2, v[6:9] offset:16384
 ; GCN-MINREG-NEXT:    s_nop 7
 ; GCN-MINREG-NEXT:    s_nop 2
 ; GCN-MINREG-NEXT:    ds_write_b128 v2, a[28:31] offset:32880
@@ -605,134 +1555,210 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-MAXOCC:       ; %bb.0: ; %entry
 ; GCN-MAXOCC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GCN-MAXOCC-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GCN-MAXOCC-NEXT:    v_lshlrev_b32_e32 v3, 7, v0
-; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v1, 1.0
-; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v2, 2.0
+; GCN-MAXOCC-NEXT:    v_lshlrev_b32_e32 v2, 7, v0
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v0, 1.0
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT:    v_add_u32_e32 v0, s0, v3
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v0 offset:112
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v0 offset:96
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v0 offset:80
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v0 offset:64
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v0
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v0 offset:16
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v0 offset:32
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v0 offset:48
+; GCN-MAXOCC-NEXT:    v_add_u32_e32 v3, s0, v2
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v3 offset:112
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v3 offset:96
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v3 offset:80
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v3 offset:64
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v3
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v3 offset:16
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v3 offset:32
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v3 offset:48
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-MAXOCC-NEXT:    v_add_u32_e32 v3, s1, v3
+; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[60:63], v3 offset:8304
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[56:59], v3 offset:8288
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[52:55], v3 offset:8272
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[48:51], v3 offset:8256
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[44:47], v3 offset:8240
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[40:43], v3 offset:8224
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[36:39], v3 offset:8208
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[32:35], v3 offset:8192
+; GCN-MAXOCC-NEXT:    v_add_u32_e32 v2, s1, v2
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
 ; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[28:31] offset:112
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[24:27] offset:96
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[20:23] offset:80
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[16:19] offset:64
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[12:15] offset:48
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[8:11] offset:32
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[4:7] offset:16
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[0:3]
+; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-MAXOCC-NEXT:    s_nop 1
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[28:31] offset:112
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[24:27] offset:96
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[20:23] offset:80
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[16:19] offset:64
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[12:15] offset:48
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[8:11] offset:32
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[4:7] offset:16
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[0:3]
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v0 offset:8304
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v0 offset:8288
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v0 offset:8272
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v0 offset:8256
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v0 offset:8240
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v0 offset:8224
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v0 offset:8208
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v0 offset:8192
-; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-MAXOCC-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[56:59] offset:8288
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[60:63] offset:8304
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[48:51] offset:8256
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[52:55] offset:8272
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[40:43] offset:8224
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[44:47] offset:8240
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[32:35] offset:8192
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[36:39] offset:8208
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 1
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[24:27] offset:8288
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[28:31] offset:8304
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[16:19] offset:8256
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[20:23] offset:8272
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[8:11] offset:8224
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[12:15] offset:8240
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[0:3] offset:8192
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[4:7] offset:8208
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_barrier mask(0x00000000)
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v0 offset:24688
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v0 offset:24672
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v0 offset:24656
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v0 offset:24640
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v0 offset:24576
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v0 offset:24592
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v0 offset:24608
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v0 offset:24624
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[4:7], v3 offset:24576
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[32:35], v3 offset:24688
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[28:31], v3 offset:24672
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[24:27], v3 offset:24656
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[20:23], v3 offset:24640
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[8:11], v3 offset:24592
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[12:15], v3 offset:24608
+; GCN-MAXOCC-NEXT:    ds_read_b128 v[16:19], v3 offset:24624
 ; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a0, v4
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a1, v5
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a2, v6
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a3, v7
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a4, v8
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a5, v9
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a6, v10
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a7, v11
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a8, v12
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a9, v13
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a10, v14
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a11, v15
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a12, v16
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a13, v17
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a14, v18
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a15, v19
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a16, v20
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a17, v21
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a18, v22
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a19, v23
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a20, v24
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a21, v25
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a22, v26
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a23, v27
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a24, v28
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a25, v29
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a26, v30
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a27, v31
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a28, v32
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a29, v33
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a30, v34
+; GCN-MAXOCC-NEXT:    v_accvgpr_write_b32 a31, v35
+; GCN-MAXOCC-NEXT:    v_add_u32_e32 v4, 0x6000, v3
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-MAXOCC-NEXT:    s_nop 0
+; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[60:63], v3 offset:49264
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[56:59], v3 offset:49248
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[52:55], v3 offset:49232
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[48:51], v3 offset:49216
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[44:47], v3 offset:49200
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[40:43], v3 offset:49184
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[36:39], v3 offset:49168
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[32:35], v3 offset:49152
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 7
 ; GCN-MAXOCC-NEXT:    s_nop 2
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[28:31] offset:16496
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[24:27] offset:16480
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[20:23] offset:16464
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[16:19] offset:16448
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[12:15] offset:16432
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[8:11] offset:16416
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[4:7] offset:16400
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[0:3] offset:16384
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v0 offset:49264
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v0 offset:49248
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v0 offset:49232
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v0 offset:49216
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v0 offset:49200
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v0 offset:49184
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v0 offset:49168
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v0 offset:49152
-; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-MAXOCC-NEXT:    v_add_u32_e32 v0, 0x6000, v0
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v37, a31
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v35, a29
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v34, a28
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v33, a27
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v32, a26
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v31, a25
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v30, a24
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v29, a23
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v28, a22
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v27, a21
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v26, a20
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v25, a19
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v24, a18
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v23, a17
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v22, a16
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v21, a15
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v20, a14
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v19, a13
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v18, a12
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v17, a11
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v16, a10
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v15, a9
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v14, a8
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v13, a7
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v12, a6
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v11, a5
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v10, a4
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v9, a3
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v8, a2
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v7, a1
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v6, a0
+; GCN-MAXOCC-NEXT:    v_accvgpr_read_b32 v36, a30
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v4 offset:57456
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v4 offset:57440
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v4 offset:57424
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v4 offset:57408
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v4 offset:57344
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v4 offset:57360
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v4 offset:57376
+; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v4 offset:57392
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[8:9], v[10:11], v[10:11] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[10:11], v[12:13], v[12:13] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[12:13], v[14:15], v[14:15] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[14:15], v[16:17], v[16:17] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[16:17], v[18:19], v[18:19] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[18:19], v[20:21], v[20:21] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[20:21], v[22:23], v[22:23] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[22:23], v[24:25], v[24:25] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[24:25], v[26:27], v[26:27] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[26:27], v[28:29], v[28:29] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[28:29], v[30:31], v[30:31] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[30:31], v[32:33], v[32:33] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[32:33], v[34:35], v[34:35] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    v_pk_mov_b32 v[34:35], v[36:37], v[36:37] op_sel:[0,1]
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, v[32:35] offset:16496
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, v[28:31] offset:16480
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, v[24:27] offset:16464
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, v[20:23] offset:16448
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, v[16:19] offset:16432
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, v[12:15] offset:16416
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, v[8:11] offset:16400
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, v[4:7] offset:16384
+; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(14)
+; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-MAXOCC-NEXT:    s_nop 7
 ; GCN-MAXOCC-NEXT:    s_nop 7
-; GCN-MAXOCC-NEXT:    s_nop 1
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[28:31] offset:24688
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[24:27] offset:24672
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[20:23] offset:24656
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[16:19] offset:24640
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[12:15] offset:24624
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[8:11] offset:24608
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[4:7] offset:24592
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[0:3] offset:24576
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[28:31], v0 offset:57456
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[24:27], v0 offset:57440
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[20:23], v0 offset:57424
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[16:19], v0 offset:57408
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[0:3], v0 offset:57344
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[4:7], v0 offset:57360
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[8:11], v0 offset:57376
-; GCN-MAXOCC-NEXT:    ds_read_b128 a[12:15], v0 offset:57392
-; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-MAXOCC-NEXT:    s_nop 2
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[60:63] offset:24688
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[56:59] offset:24672
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[52:55] offset:24656
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[48:51] offset:24640
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[44:47] offset:24624
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[40:43] offset:24608
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[36:39] offset:24592
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[32:35] offset:24576
+; GCN-MAXOCC-NEXT:    s_waitcnt lgkmcnt(14)
+; GCN-MAXOCC-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-MAXOCC-NEXT:    s_nop 7
 ; GCN-MAXOCC-NEXT:    s_nop 7
 ; GCN-MAXOCC-NEXT:    s_nop 2
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[28:31] offset:32880
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[24:27] offset:32864
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[20:23] offset:32848
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[16:19] offset:32832
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[12:15] offset:32816
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[8:11] offset:32800
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[4:7] offset:32784
-; GCN-MAXOCC-NEXT:    ds_write_b128 v3, a[0:3] offset:32768
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[28:31] offset:32880
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[24:27] offset:32864
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[20:23] offset:32848
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[16:19] offset:32832
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[12:15] offset:32816
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[8:11] offset:32800
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[4:7] offset:32784
+; GCN-MAXOCC-NEXT:    ds_write_b128 v2, a[0:3] offset:32768
 ; GCN-MAXOCC-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-MAXOCC-NEXT:    s_endpgm
 ;
@@ -745,47 +1771,40 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-ILP-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-ILP-NEXT:    v_add_u32_e32 v3, s0, v2
-; GCN-ILP-NEXT:    ds_read_b128 a[12:15], v3 offset:48
-; GCN-ILP-NEXT:    ds_read_b128 a[8:11], v3 offset:32
-; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:16
-; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3
-; GCN-ILP-NEXT:    ds_read_b128 a[16:19], v3 offset:64
-; GCN-ILP-NEXT:    ds_read_b128 a[20:23], v3 offset:80
-; GCN-ILP-NEXT:    ds_read_b128 a[24:27], v3 offset:96
-; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:112
+; GCN-ILP-NEXT:    ds_read_b128 a[44:47], v3 offset:48
+; GCN-ILP-NEXT:    ds_read_b128 a[40:43], v3 offset:32
+; GCN-ILP-NEXT:    ds_read_b128 a[36:39], v3 offset:16
+; GCN-ILP-NEXT:    ds_read_b128 a[32:35], v3
+; GCN-ILP-NEXT:    ds_read_b128 a[48:51], v3 offset:64
+; GCN-ILP-NEXT:    ds_read_b128 a[52:55], v3 offset:80
+; GCN-ILP-NEXT:    ds_read_b128 a[56:59], v3 offset:96
+; GCN-ILP-NEXT:    ds_read_b128 a[60:63], v3 offset:112
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GCN-ILP-NEXT:    v_add_u32_e32 v2, s1, v2
-; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 1
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[0:3]
+; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
 ; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3 offset:8192
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[4:7] offset:16
 ; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:8208
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[8:11] offset:32
 ; GCN-ILP-NEXT:    ds_read_b128 a[8:11], v3 offset:8224
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[12:15] offset:48
 ; GCN-ILP-NEXT:    ds_read_b128 a[12:15], v3 offset:8240
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[16:19] offset:64
 ; GCN-ILP-NEXT:    ds_read_b128 a[16:19], v3 offset:8256
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[20:23] offset:80
 ; GCN-ILP-NEXT:    ds_read_b128 a[20:23], v3 offset:8272
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[24:27] offset:96
 ; GCN-ILP-NEXT:    ds_read_b128 a[24:27], v3 offset:8288
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[28:31] offset:112
 ; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:8304
-; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT:    v_add_u32_e32 v4, s1, v2
 ; GCN-ILP-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
-; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 1
+; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT:    s_nop 6
+; GCN-ILP-NEXT:    ds_write_b128 v4, a[32:35]
+; GCN-ILP-NEXT:    ds_write_b128 v4, a[36:39] offset:16
+; GCN-ILP-NEXT:    ds_write_b128 v4, a[40:43] offset:32
+; GCN-ILP-NEXT:    ds_write_b128 v4, a[44:47] offset:48
+; GCN-ILP-NEXT:    ds_write_b128 v4, a[48:51] offset:64
+; GCN-ILP-NEXT:    ds_write_b128 v4, a[52:55] offset:80
+; GCN-ILP-NEXT:    ds_write_b128 v4, a[56:59] offset:96
+; GCN-ILP-NEXT:    ds_write_b128 v4, a[60:63] offset:112
+; GCN-ILP-NEXT:    s_nop 3
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[24:27] offset:8288
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[28:31] offset:8304
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[16:19] offset:8256
@@ -795,6 +1814,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[0:3] offset:8192
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[4:7] offset:8208
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
+; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_barrier mask(0x00000000)
 ; GCN-ILP-NEXT:    ds_read_b128 a[12:15], v3 offset:24624
 ; GCN-ILP-NEXT:    ds_read_b128 a[8:11], v3 offset:24608
@@ -806,53 +1828,82 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:24688
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT:    ds_read_b128 a[60:63], v3 offset:49264
+; GCN-ILP-NEXT:    ds_read_b128 a[56:59], v3 offset:49248
+; GCN-ILP-NEXT:    ds_read_b128 a[52:55], v3 offset:49232
+; GCN-ILP-NEXT:    ds_read_b128 a[48:51], v3 offset:49216
+; GCN-ILP-NEXT:    ds_read_b128 a[44:47], v3 offset:49200
+; GCN-ILP-NEXT:    ds_read_b128 a[40:43], v3 offset:49184
+; GCN-ILP-NEXT:    ds_read_b128 a[36:39], v3 offset:49168
+; GCN-ILP-NEXT:    ds_read_b128 a[32:35], v3 offset:49152
+; GCN-ILP-NEXT:    v_add_u32_e32 v3, 0x6000, v3
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 2
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[28:31] offset:16496
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[24:27] offset:16480
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[20:23] offset:16464
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[16:19] offset:16448
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[12:15] offset:16432
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[8:11] offset:16416
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[4:7] offset:16400
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[0:3] offset:16384
-; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3 offset:49152
-; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:49168
-; GCN-ILP-NEXT:    ds_read_b128 a[8:11], v3 offset:49184
-; GCN-ILP-NEXT:    ds_read_b128 a[12:15], v3 offset:49200
-; GCN-ILP-NEXT:    ds_read_b128 a[16:19], v3 offset:49216
-; GCN-ILP-NEXT:    ds_read_b128 a[20:23], v3 offset:49232
-; GCN-ILP-NEXT:    ds_read_b128 a[24:27], v3 offset:49248
-; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:49264
 ; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GCN-ILP-NEXT:    v_add_u32_e32 v3, 0x6000, v3
-; GCN-ILP-NEXT:    s_nop 7
+; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[32:63]
 ; GCN-ILP-NEXT:    s_nop 7
-; GCN-ILP-NEXT:    s_nop 1
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[0:3] offset:24576
-; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3 offset:57344
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[4:7] offset:24592
-; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:57360
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[8:11] offset:24608
-; GCN-ILP-NEXT:    ds_read_b128 a[8:11], v3 offset:57376
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[12:15] offset:24624
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v35, a31
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v34, a30
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v33, a29
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v32, a28
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v31, a27
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v30, a26
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v29, a25
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v28, a24
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v27, a23
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v26, a22
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v25, a21
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v24, a20
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v23, a19
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v22, a18
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v21, a17
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v20, a16
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v19, a15
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v18, a14
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v17, a13
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v16, a12
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v15, a11
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v14, a10
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v13, a9
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v12, a8
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v11, a7
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v10, a6
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v9, a5
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v8, a4
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v7, a3
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v6, a2
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v5, a1
+; GCN-ILP-NEXT:    v_accvgpr_read_b32 v4, a0
 ; GCN-ILP-NEXT:    ds_read_b128 a[12:15], v3 offset:57392
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[16:19] offset:24640
+; GCN-ILP-NEXT:    ds_read_b128 a[8:11], v3 offset:57376
+; GCN-ILP-NEXT:    ds_read_b128 a[4:7], v3 offset:57360
+; GCN-ILP-NEXT:    ds_read_b128 a[0:3], v3 offset:57344
 ; GCN-ILP-NEXT:    ds_read_b128 a[16:19], v3 offset:57408
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[20:23] offset:24656
 ; GCN-ILP-NEXT:    ds_read_b128 a[20:23], v3 offset:57424
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[24:27] offset:24672
 ; GCN-ILP-NEXT:    ds_read_b128 a[24:27], v3 offset:57440
-; GCN-ILP-NEXT:    ds_write_b128 v2, a[28:31] offset:24688
 ; GCN-ILP-NEXT:    ds_read_b128 a[28:31], v3 offset:57456
-; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[60:63] offset:24688
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[56:59] offset:24672
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[52:55] offset:24656
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[48:51] offset:24640
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[44:47] offset:24624
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[40:43] offset:24608
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[36:39] offset:24592
+; GCN-ILP-NEXT:    ds_write_b128 v2, a[32:35] offset:24576
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[32:35] offset:16496
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[28:31] offset:16480
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[24:27] offset:16464
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[20:23] offset:16448
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[16:19] offset:16432
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[12:15] offset:16416
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[8:11] offset:16400
+; GCN-ILP-NEXT:    ds_write_b128 v2, v[4:7] offset:16384
+; GCN-ILP-NEXT:    s_waitcnt lgkmcnt(14)
 ; GCN-ILP-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
+; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    s_nop 7
@@ -867,8 +1918,6 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[4:7] offset:32784
 ; GCN-ILP-NEXT:    ds_write_b128 v2, a[0:3] offset:32768
 ; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
-; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-ILP-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-ILP-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 73586b1243376..266df5d56b5c2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -387,88 +387,87 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; GCN-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; GCN-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v13, v13, v13
-; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
-; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
-; GCN-NEXT:    v_mul_lo_u32 v14, v14, v14
-; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_mul_lo_u32 v7, v7, v7
+; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:80
+; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
+; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
+; GCN-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GCN-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:48
+; GCN-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
+; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
+; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
+; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
+; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
+; GCN-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
+; GCN-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:80
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v3
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v1, v1, v1
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, v0
-; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:80
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; GCN-NEXT:    s_waitcnt vmcnt(5)
+; GCN-NEXT:    v_mul_lo_u32 v15, v15, v15
+; GCN-NEXT:    v_mul_lo_u32 v14, v14, v14
+; GCN-NEXT:    v_mul_lo_u32 v13, v13, v13
+; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    v_mul_lo_u32 v19, v19, v19
+; GCN-NEXT:    v_mul_lo_u32 v18, v18, v18
+; GCN-NEXT:    v_mul_lo_u32 v17, v17, v17
+; GCN-NEXT:    v_mul_lo_u32 v16, v16, v16
+; GCN-NEXT:    v_mul_lo_u32 v12, v12, v12
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    v_mul_lo_u32 v23, v23, v23
+; GCN-NEXT:    v_mul_lo_u32 v22, v22, v22
+; GCN-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:80
+; GCN-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
+; GCN-NEXT:    v_mul_lo_u32 v21, v21, v21
+; GCN-NEXT:    v_mul_lo_u32 v20, v20, v20
+; GCN-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:48
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v7, v7, v7
-; GCN-NEXT:    v_mul_lo_u32 v6, v6, v6
-; GCN-NEXT:    v_mul_lo_u32 v5, v5, v5
-; GCN-NEXT:    v_mul_lo_u32 v4, v4, v4
-; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
-; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
-; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
-; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:64
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mul_lo_u32 v3, v31, v31
+; GCN-NEXT:    v_mul_lo_u32 v2, v30, v30
+; GCN-NEXT:    v_mul_lo_u32 v1, v29, v29
+; GCN-NEXT:    v_mul_lo_u32 v0, v28, v28
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3] offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(7)
+; GCN-NEXT:    v_mul_lo_u32 v1, v25, v25
+; GCN-NEXT:    v_mul_lo_u32 v0, v24, v24
+; GCN-NEXT:    v_mul_lo_u32 v3, v27, v27
+; GCN-NEXT:    v_mul_lo_u32 v2, v26, v26
+; GCN-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3] offset:32
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v11, v11, v11
-; GCN-NEXT:    v_mul_lo_u32 v10, v10, v10
-; GCN-NEXT:    v_mul_lo_u32 v9, v9, v9
-; GCN-NEXT:    v_mul_lo_u32 v8, v8, v8
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:64
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
@@ -476,88 +475,87 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA
 ; EXACTCUTOFF:       ; %bb.0:
 ; EXACTCUTOFF-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; EXACTCUTOFF-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v16, 7, v0
+; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
 ; EXACTCUTOFF-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:32
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:16
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:112
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:96
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v13, v13, v13
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v14, v14, v14
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[12:15], s[2:3] offset:32
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(1)
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v7, v7, v7
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:80
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:48
+; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:32
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(4)
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:112
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:96
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
 ; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3]
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:112
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:112
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:96
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:96
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1] offset:80
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v3, v3
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v2, v2
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v1, v1
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v0, v0
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[0:3], s[2:3] offset:80
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:48
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(5)
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v15, v15, v15
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v14, v14, v14
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v13, v13, v13
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(4)
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v19, v19, v19
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v18, v18, v18
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v17, v17, v17
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v16, v16, v16
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v12, v12, v12
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(3)
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v23, v23, v23
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v22, v22, v22
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:80
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v21, v21, v21
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v20, v20, v20
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:48
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v7, v7, v7
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v6, v6, v6
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v5, v5, v5
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v4, v4, v4
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[4:7], s[2:3] offset:48
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:16
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
-; EXACTCUTOFF-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:64
+; EXACTCUTOFF-NEXT:    s_nop 0
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v31, v31
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v30, v30
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v29, v29
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v28, v28
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3] offset:16
+; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(7)
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v1, v25, v25
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v0, v24, v24
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v3, v27, v27
+; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v2, v26, v26
+; EXACTCUTOFF-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3] offset:32
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000020) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000002) size(2) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_waitcnt vmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v11, v11, v11
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v10, v10, v10
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v9, v9, v9
-; EXACTCUTOFF-NEXT:    v_mul_lo_u32 v8, v8, v8
-; EXACTCUTOFF-NEXT:    global_store_dwordx4 v16, v[8:11], s[2:3] offset:64
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000040) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #2
@@ -887,12 +885,44 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:48
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT:    ds_read_b128 a[60:63], v1 offset:8304
+; GCN-NEXT:    ds_read_b128 a[56:59], v1 offset:8288
+; GCN-NEXT:    ds_read_b128 a[52:55], v1 offset:8272
+; GCN-NEXT:    ds_read_b128 a[48:51], v1 offset:8256
+; GCN-NEXT:    ds_read_b128 a[44:47], v1 offset:8240
+; GCN-NEXT:    ds_read_b128 a[40:43], v1 offset:8224
+; GCN-NEXT:    ds_read_b128 a[36:39], v1 offset:8208
+; GCN-NEXT:    ds_read_b128 a[32:35], v1 offset:8192
+; GCN-NEXT:    v_add_u32_e32 v4, 0x6000, v1
 ; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
+; GCN-NEXT:    ds_read_b128 a[92:95], v1 offset:24688
+; GCN-NEXT:    ds_read_b128 a[88:91], v1 offset:24672
+; GCN-NEXT:    ds_read_b128 a[84:87], v1 offset:24656
+; GCN-NEXT:    ds_read_b128 a[80:83], v1 offset:24640
+; GCN-NEXT:    ds_read_b128 a[76:79], v1 offset:24624
+; GCN-NEXT:    ds_read_b128 a[72:75], v1 offset:24608
+; GCN-NEXT:    ds_read_b128 a[68:71], v1 offset:24592
+; GCN-NEXT:    ds_read_b128 a[64:67], v1 offset:24576
+; GCN-NEXT:    ds_read_b128 a[124:127], v1 offset:49264
+; GCN-NEXT:    ds_read_b128 a[120:123], v1 offset:49248
+; GCN-NEXT:    ds_read_b128 a[116:119], v1 offset:49232
+; GCN-NEXT:    ds_read_b128 a[112:115], v1 offset:49216
+; GCN-NEXT:    ds_read_b128 a[108:111], v1 offset:49200
+; GCN-NEXT:    ds_read_b128 a[104:107], v1 offset:49184
+; GCN-NEXT:    ds_read_b128 a[100:103], v1 offset:49168
+; GCN-NEXT:    ds_read_b128 a[96:99], v1 offset:49152
+; GCN-NEXT:    ds_read_b128 a[156:159], v4 offset:57456
+; GCN-NEXT:    ds_read_b128 a[152:155], v4 offset:57440
+; GCN-NEXT:    ds_read_b128 a[148:151], v4 offset:57424
+; GCN-NEXT:    ds_read_b128 a[144:147], v4 offset:57408
+; GCN-NEXT:    ds_read_b128 a[128:131], v4 offset:57344
+; GCN-NEXT:    ds_read_b128 a[132:135], v4 offset:57360
+; GCN-NEXT:    ds_read_b128 a[136:139], v4 offset:57376
+; GCN-NEXT:    ds_read_b128 a[140:143], v4 offset:57392
 ; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:112
 ; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:96
 ; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:80
@@ -901,104 +931,64 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32
 ; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16
 ; GCN-NEXT:    ds_write_b128 v0, a[0:3]
-; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:8304
-; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:8288
-; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:8272
-; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:8256
-; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:8240
-; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:8224
-; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:8208
-; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:8192
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
+; GCN-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
+; GCN-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
+; GCN-NEXT:    ds_write_b128 v0, a[52:55] offset:8272
+; GCN-NEXT:    ds_write_b128 v0, a[40:43] offset:8224
+; GCN-NEXT:    ds_write_b128 v0, a[44:47] offset:8240
+; GCN-NEXT:    ds_write_b128 v0, a[32:35] offset:8192
+; GCN-NEXT:    ds_write_b128 v0, a[36:39] offset:8208
+; GCN-NEXT:    s_waitcnt lgkmcnt(14)
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
-; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:8304
-; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:8256
-; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:8272
-; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:8224
-; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:8240
-; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:8192
-; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:8208
-; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:24688
-; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:24672
-; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:24656
-; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:24640
-; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:24624
-; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:24608
-; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:24592
-; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:24576
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    ds_write_b128 v0, a[88:91] offset:16480
+; GCN-NEXT:    ds_write_b128 v0, a[92:95] offset:16496
+; GCN-NEXT:    ds_write_b128 v0, a[80:83] offset:16448
+; GCN-NEXT:    ds_write_b128 v0, a[84:87] offset:16464
+; GCN-NEXT:    ds_write_b128 v0, a[72:75] offset:16416
+; GCN-NEXT:    ds_write_b128 v0, a[76:79] offset:16432
+; GCN-NEXT:    ds_write_b128 v0, a[64:67] offset:16384
+; GCN-NEXT:    ds_write_b128 v0, a[68:71] offset:16400
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
-; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
-; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:16448
-; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:16464
-; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:16416
-; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:16432
-; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:16384
-; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:16400
-; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:49264
-; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:49248
-; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:49232
-; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:49216
-; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:49200
-; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:49184
-; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:49168
-; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:49152
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-NEXT:    v_add_u32_e32 v1, 0x6000, v1
+; GCN-NEXT:    ds_write_b128 v0, a[120:123] offset:24672
+; GCN-NEXT:    ds_write_b128 v0, a[124:127] offset:24688
+; GCN-NEXT:    ds_write_b128 v0, a[112:115] offset:24640
+; GCN-NEXT:    ds_write_b128 v0, a[116:119] offset:24656
+; GCN-NEXT:    ds_write_b128 v0, a[104:107] offset:24608
+; GCN-NEXT:    ds_write_b128 v0, a[108:111] offset:24624
+; GCN-NEXT:    ds_write_b128 v0, a[96:99] offset:24576
+; GCN-NEXT:    ds_write_b128 v0, a[100:103] offset:24592
+; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:24672
-; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:24688
-; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:24640
-; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:24656
-; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:24608
-; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:24624
-; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:24576
-; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:24592
-; GCN-NEXT:    ds_read_b128 a[28:31], v1 offset:57456
-; GCN-NEXT:    ds_read_b128 a[24:27], v1 offset:57440
-; GCN-NEXT:    ds_read_b128 a[20:23], v1 offset:57424
-; GCN-NEXT:    ds_read_b128 a[16:19], v1 offset:57408
-; GCN-NEXT:    ds_read_b128 a[0:3], v1 offset:57344
-; GCN-NEXT:    ds_read_b128 a[4:7], v1 offset:57360
-; GCN-NEXT:    ds_read_b128 a[8:11], v1 offset:57376
-; GCN-NEXT:    ds_read_b128 a[12:15], v1 offset:57392
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    ds_write_b128 v0, a[152:155] offset:32864
+; GCN-NEXT:    ds_write_b128 v0, a[156:159] offset:32880
+; GCN-NEXT:    ds_write_b128 v0, a[144:147] offset:32832
+; GCN-NEXT:    ds_write_b128 v0, a[148:151] offset:32848
+; GCN-NEXT:    ds_write_b128 v0, a[136:139] offset:32800
+; GCN-NEXT:    ds_write_b128 v0, a[140:143] offset:32816
+; GCN-NEXT:    ds_write_b128 v0, a[128:131] offset:32768
+; GCN-NEXT:    ds_write_b128 v0, a[132:135] offset:32784
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
-; GCN-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
-; GCN-NEXT:    ds_write_b128 v0, a[16:19] offset:32832
-; GCN-NEXT:    ds_write_b128 v0, a[20:23] offset:32848
-; GCN-NEXT:    ds_write_b128 v0, a[8:11] offset:32800
-; GCN-NEXT:    ds_write_b128 v0, a[12:15] offset:32816
-; GCN-NEXT:    ds_write_b128 v0, a[0:3] offset:32768
-; GCN-NEXT:    ds_write_b128 v0, a[4:7] offset:32784
 ; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; GCN-NEXT:    s_endpgm
 ;
@@ -1021,12 +1011,44 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:48
 ; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[60:63], v1 offset:8304
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[56:59], v1 offset:8288
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[52:55], v1 offset:8272
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[48:51], v1 offset:8256
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[44:47], v1 offset:8240
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[40:43], v1 offset:8224
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[36:39], v1 offset:8208
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[32:35], v1 offset:8192
+; EXACTCUTOFF-NEXT:    v_add_u32_e32 v4, 0x6000, v1
 ; EXACTCUTOFF-NEXT:    v_add_u32_e32 v0, s1, v0
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 1
+; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[92:95], v1 offset:24688
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[88:91], v1 offset:24672
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[84:87], v1 offset:24656
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[80:83], v1 offset:24640
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[76:79], v1 offset:24624
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[72:75], v1 offset:24608
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[68:71], v1 offset:24592
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[64:67], v1 offset:24576
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[124:127], v1 offset:49264
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[120:123], v1 offset:49248
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[116:119], v1 offset:49232
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[112:115], v1 offset:49216
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[108:111], v1 offset:49200
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[104:107], v1 offset:49184
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[100:103], v1 offset:49168
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[96:99], v1 offset:49152
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[156:159], v4 offset:57456
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[152:155], v4 offset:57440
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[148:151], v4 offset:57424
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[144:147], v4 offset:57408
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[128:131], v4 offset:57344
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[132:135], v4 offset:57360
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[136:139], v4 offset:57376
+; EXACTCUTOFF-NEXT:    ds_read_b128 a[140:143], v4 offset:57392
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:112
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:96
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[20:23] offset:80
@@ -1035,104 +1057,64 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:32
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:16
 ; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3]
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:8304
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:8288
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:8272
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[16:19], v1 offset:8256
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:8240
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[8:11], v1 offset:8224
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[4:7], v1 offset:8208
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1 offset:8192
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
 ; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[56:59] offset:8288
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[60:63] offset:8304
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[48:51] offset:8256
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[52:55] offset:8272
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[40:43] offset:8224
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[44:47] offset:8240
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[32:35] offset:8192
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[36:39] offset:8208
+; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(14)
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_nop 7
 ; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 1
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:8288
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:8304
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[16:19] offset:8256
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[20:23] offset:8272
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:8224
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[12:15] offset:8240
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3] offset:8192
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:8208
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:24688
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:24672
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:24656
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[16:19], v1 offset:24640
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:24624
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[8:11], v1 offset:24608
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[4:7], v1 offset:24592
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1 offset:24576
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; EXACTCUTOFF-NEXT:    s_nop 2
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[88:91] offset:16480
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[92:95] offset:16496
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[80:83] offset:16448
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[84:87] offset:16464
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[72:75] offset:16416
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[76:79] offset:16432
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[64:67] offset:16384
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[68:71] offset:16400
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_nop 7
 ; EXACTCUTOFF-NEXT:    s_nop 7
 ; EXACTCUTOFF-NEXT:    s_nop 2
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:16480
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:16496
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[16:19] offset:16448
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[20:23] offset:16464
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:16416
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[12:15] offset:16432
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3] offset:16384
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:16400
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:49264
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:49248
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:49232
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[16:19], v1 offset:49216
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:49200
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[8:11], v1 offset:49184
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[4:7], v1 offset:49168
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1 offset:49152
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; EXACTCUTOFF-NEXT:    v_add_u32_e32 v1, 0x6000, v1
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[120:123] offset:24672
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[124:127] offset:24688
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[112:115] offset:24640
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[116:119] offset:24656
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[104:107] offset:24608
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[108:111] offset:24624
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[96:99] offset:24576
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[100:103] offset:24592
+; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159]
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_nop 7
 ; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 1
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:24672
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:24688
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[16:19] offset:24640
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[20:23] offset:24656
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:24608
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[12:15] offset:24624
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3] offset:24576
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:24592
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[28:31], v1 offset:57456
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[24:27], v1 offset:57440
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[20:23], v1 offset:57424
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[16:19], v1 offset:57408
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[0:3], v1 offset:57344
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[4:7], v1 offset:57360
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[8:11], v1 offset:57376
-; EXACTCUTOFF-NEXT:    ds_read_b128 a[12:15], v1 offset:57392
-; EXACTCUTOFF-NEXT:    s_waitcnt lgkmcnt(0)
-; EXACTCUTOFF-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; EXACTCUTOFF-NEXT:    s_nop 2
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[152:155] offset:32864
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[156:159] offset:32880
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[144:147] offset:32832
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[148:151] offset:32848
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[136:139] offset:32800
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[140:143] offset:32816
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[128:131] offset:32768
+; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[132:135] offset:32784
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 7
-; EXACTCUTOFF-NEXT:    s_nop 2
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[24:27] offset:32864
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[28:31] offset:32880
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[16:19] offset:32832
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[20:23] offset:32848
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[8:11] offset:32800
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[12:15] offset:32816
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[0:3] offset:32768
-; EXACTCUTOFF-NEXT:    ds_write_b128 v0, a[4:7] offset:32784
 ; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
 ; EXACTCUTOFF-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
index 190384255bf23..efece9d02950d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
@@ -1119,21 +1119,44 @@ define amdgpu_kernel void @kern_align32_global_ptr(ptr addrspace(1) align 1024 %
 }
 
 define amdgpu_kernel void @kern_noalias_global_ptr(ptr addrspace(1) noalias %ptr) #0 {
-; GCN-LABEL: @kern_noalias_global_ptr(
-; GCN-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; GCN-NEXT:    store volatile ptr addrspace(1) [[PTR:%.*]], ptr addrspace(1) poison, align 8
-; GCN-NEXT:    ret void
+; HSA-LABEL: @kern_noalias_global_ptr(
+; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; HSA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META5:![0-9]+]]
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_noalias_global_ptr(
+; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; MESA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META5:![0-9]+]]
+; MESA-NEXT:    ret void
 ;
   store volatile ptr addrspace(1) %ptr, ptr addrspace(1) poison
   ret void
 }
 
 define amdgpu_kernel void @kern_noalias_global_ptr_x2(ptr addrspace(1) noalias %ptr0, ptr addrspace(1) noalias %ptr1) #0 {
-; GCN-LABEL: @kern_noalias_global_ptr_x2(
-; GCN-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; GCN-NEXT:    store volatile ptr addrspace(1) [[PTR0:%.*]], ptr addrspace(1) poison, align 8
-; GCN-NEXT:    store volatile ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(1) poison, align 8
-; GCN-NEXT:    ret void
+; HSA-LABEL: @kern_noalias_global_ptr_x2(
+; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[PTR0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT:    [[PTR0_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR0_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; HSA-NEXT:    [[PTR1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 8
+; HSA-NEXT:    [[PTR1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; HSA-NEXT:    store volatile ptr addrspace(1) [[PTR0_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8:![0-9]+]]
+; HSA-NEXT:    store volatile ptr addrspace(1) [[PTR1_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8]]
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @kern_noalias_global_ptr_x2(
+; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[PTR0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[PTR0_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR0_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; MESA-NEXT:    [[PTR1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 44
+; MESA-NEXT:    [[PTR1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; MESA-NEXT:    store volatile ptr addrspace(1) [[PTR0_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8:![0-9]+]]
+; MESA-NEXT:    store volatile ptr addrspace(1) [[PTR1_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8]]
+; MESA-NEXT:    ret void
 ;
   store volatile ptr addrspace(1) %ptr0, ptr addrspace(1) poison
   store volatile ptr addrspace(1) %ptr1, ptr addrspace(1) poison
@@ -1855,10 +1878,24 @@ attributes #2 = { nounwind "target-cpu"="tahiti" }
 ; HSA: [[META2]] = !{i64 42}
 ; HSA: [[META3]] = !{i64 128}
 ; HSA: [[META4]] = !{i64 1024}
+; HSA: [[META5]] = !{[[META6:![0-9]+]]}
+; HSA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: %ptr"}
+; HSA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"}
+; HSA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]}
+; HSA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: %ptr0"}
+; HSA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"}
+; HSA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: %ptr1"}
 ;.
 ; MESA: [[META0]] = !{}
 ; MESA: [[RNG1]] = !{i32 0, i32 8}
 ; MESA: [[META2]] = !{i64 42}
 ; MESA: [[META3]] = !{i64 128}
 ; MESA: [[META4]] = !{i64 1024}
+; MESA: [[META5]] = !{[[META6:![0-9]+]]}
+; MESA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: %ptr"}
+; MESA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"}
+; MESA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]}
+; MESA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: %ptr0"}
+; MESA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"}
+; MESA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: %ptr1"}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
index 0ac3d652050d3..ba59b94b6d141 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
@@ -7,26 +7,44 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[12:15], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:4
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:8
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:8
+; SDAG-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:12
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT:    v_mul_f32_e32 v1, v1, v1
-; SDAG-NEXT:    v_mul_f32_e32 v2, v2, v2
-; SDAG-NEXT:    v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:12
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_dont_alias:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[12:15], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4
+; GISEL-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:8
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:8
+; GISEL-NEXT:    buffer_load_dword v0, off, s[8:11], 0 offset:12
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v1
-; GISEL-NEXT:    v_mul_f32_e32 v2, v2, v2
-; GISEL-NEXT:    v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:12
 ; GISEL-NEXT:    s_endpgm
   %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0)
   %s0 = fmul float %l0, %l0
@@ -56,15 +74,26 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    s_and_b32 s5, s1, 0xffff
 ; SDAG-NEXT:    s_mov_b32 s4, s0
-; SDAG-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; SDAG-NEXT:    s_and_b32 s5, s3, 0xffff
-; SDAG-NEXT:    s_mov_b32 s4, s2
+; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; SDAG-NEXT:    s_and_b32 s1, s3, 0xffff
+; SDAG-NEXT:    s_mov_b32 s0, s2
+; SDAG-NEXT:    s_mov_b32 s2, s6
+; SDAG-NEXT:    s_mov_b32 s3, s7
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:4
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT:    v_mul_f32_e32 v1, v1, v1
-; SDAG-NEXT:    v_mul_f32_e32 v2, v2, v2
-; SDAG-NEXT:    v_mul_f32_e32 v3, v3, v3
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:8
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
+; SDAG-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:12
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v0
+; SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: buffers_from_flat_dont_alias:
@@ -72,18 +101,29 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr
 ; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GISEL-NEXT:    s_mov_b32 s7, 0
 ; GISEL-NEXT:    s_mov_b32 s6, 16
+; GISEL-NEXT:    s_mov_b32 s10, s6
+; GISEL-NEXT:    s_mov_b32 s11, s7
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    s_and_b32 s5, s1, 0xffff
 ; GISEL-NEXT:    s_mov_b32 s4, s0
-; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GISEL-NEXT:    s_and_b32 s5, s3, 0xffff
-; GISEL-NEXT:    s_mov_b32 s4, s2
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GISEL-NEXT:    s_and_b32 s9, s3, 0xffff
+; GISEL-NEXT:    s_mov_b32 s8, s2
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:4
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:4
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:8
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:8
+; GISEL-NEXT:    buffer_load_dword v0, off, s[4:7], 0 offset:12
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v1
-; GISEL-NEXT:    v_mul_f32_e32 v2, v2, v2
-; GISEL-NEXT:    v_mul_f32_e32 v3, v3, v3
-; GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GISEL-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:12
 ; GISEL-NEXT:    s_endpgm
   %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %a.flat, i16 0, i32 16, i32 0)
   %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %b.flat, i16 0, i32 16, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index ec065b4daa376..73438a7462531 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -647,13 +647,15 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
 define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind {
 ; GFX6-LABEL: s_sub_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_sub_u32 s0, s0, s2
-; GFX6-NEXT:    s_subb_u32 s1, s1, s3
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_sub_u32 s0, s2, s8
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_subb_u32 s1, s3, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -661,41 +663,41 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
 ;
 ; GFX8-LABEL: s_sub_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_sub_u32 s0, s0, s2
-; GFX8-NEXT:    s_subb_u32 s1, s1, s3
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    s_sub_u32 s0, s2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_subb_u32 s1, s3, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_sub_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sub_u32 s0, s0, s2
-; GFX9-NEXT:    s_subb_u32 s1, s1, s3
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX9-NEXT:    s_sub_u32 s2, s2, s6
+; GFX9-NEXT:    s_subb_u32 s3, s3, s7
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: s_sub_i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    s_sub_nc_u64 s[2:3], s[2:3], s[4:5]
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_endpgm
   %result = sub i64 %a, %b
   store i64 %result, ptr addrspace(1) %out, align 8
@@ -740,12 +742,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_sub_i64:
@@ -832,14 +834,14 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v8, s0
+; GFX8-NEXT:    v_mov_b32_e32 v9, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
 ; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s0
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_sub_v2i64:

>From 7c2ef51a5284801d6342dbcea0bdfd027caab2e1 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Thu, 19 Jun 2025 04:14:51 +0100
Subject: [PATCH 2/3] Change comment style.

---
 llvm/lib/Transforms/Utils/InlineFunction.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index a56dc39e569c0..ffa23823b030a 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1394,11 +1394,11 @@ void llvm::addAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
 }
 
 void llvm::addAliasScopeMetadata(Function &F) {
-  addAliasScopeMetadataImpl(/* CB */ nullptr, &F, /* VMap */ nullptr,
+  addAliasScopeMetadataImpl(/*CB=*/ nullptr, &F, /*VMap=*/ nullptr,
                             F.getParent()->getDataLayout(),
-                            /* CalleeAAR */ nullptr,
-                            /* InlinedFunctionInfo */ nullptr,
-                            /* UseNoAliasIntrinsic */ false);
+                            /*CalleeAAR=*/ nullptr,
+                            /*InlinedFunctionInfo=*/ nullptr,
+                            /*UseNoAliasIntrinsic=*/ false);
 }
 
 static bool MayContainThrowingOrExitingCallAfterCB(CallBase *Begin,

>From bec81cabdba164952345f038f20261dcc578714c Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Thu, 19 Jun 2025 04:19:37 +0100
Subject: [PATCH 3/3] Formatting.

---
 llvm/lib/Transforms/Utils/InlineFunction.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index ffa23823b030a..377cdad452a2b 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1394,11 +1394,11 @@ void llvm::addAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
 }
 
 void llvm::addAliasScopeMetadata(Function &F) {
-  addAliasScopeMetadataImpl(/*CB=*/ nullptr, &F, /*VMap=*/ nullptr,
+  addAliasScopeMetadataImpl(/*CB=*/nullptr, &F, /*VMap=*/nullptr,
                             F.getParent()->getDataLayout(),
-                            /*CalleeAAR=*/ nullptr,
-                            /*InlinedFunctionInfo=*/ nullptr,
-                            /*UseNoAliasIntrinsic=*/ false);
+                            /*CalleeAAR=*/nullptr,
+                            /*InlinedFunctionInfo=*/nullptr,
+                            /*UseNoAliasIntrinsic=*/false);
 }
 
 static bool MayContainThrowingOrExitingCallAfterCB(CallBase *Begin,



More information about the llvm-commits mailing list