[llvm] [AMDGPU] Filter candidates of LiveRegOptimizer for profitable cases (PR #124624)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 27 12:57:26 PST 2025
https://github.com/choikwa created https://github.com/llvm/llvm-project/pull/124624
It is known that vectors whose element type fits in i16 will be split and scalarized in SelectionDAG's type legalizer
(see SIISelLowering::getPreferredVectorAction).
LRO attempts to undo the scalarizing of vectors across basic block boundaries and shoehorn Values into VGPRs. LRO is beneficial for operations that natively work on illegal vector types, to prevent flip-flopping between SGPR and VGPR. If we know that operations on a vector will be split and scalarized, then we don't want to shoehorn them back into VGPRs.
Operations that we know to work natively on illegal vector types usually come in the form of intrinsics (MFMA, DOT8), buffer stores, shuffles, and phi nodes, to name a few.
>From 94334e44d32e90ae33a415fcc856094dbbaa2c38 Mon Sep 17 00:00:00 2001
From: choikwa <code.kchoi at gmail.com>
Date: Fri, 24 Jan 2025 02:21:38 -0500
Subject: [PATCH] [AMDGPU] Filter candidates of LiveRegOptimizer for profitable
cases
It is known that vectors whose element type fits in i16 will be split
and scalarized in SelectionDAG's type legalizer
(see SIISelLowering::getPreferredVectorAction).
LRO attempts to undo the scalarizing of vectors across basic block
boundaries and shoehorn Values into VGPRs. LRO is beneficial for operations
that natively work on illegal vector types, to prevent flip-flopping
between SGPR and VGPR. If we know that operations on a vector will be
split and scalarized, then we don't want to shoehorn them back into VGPRs.
Operations that we know to work natively on illegal vector types
usually come in the form of intrinsics (MFMA, DOT8), buffer stores,
shuffles, and phi nodes, to name a few.
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 57 +-
.../AMDGPU/GlobalISel/vni8-across-blocks.ll | 2210 +++++++++++++++--
...dagcomb-extract-vec-elt-different-sizes.ll | 39 +-
.../CodeGen/AMDGPU/extract-subvector-16bit.ll | 359 +--
llvm/test/CodeGen/AMDGPU/extract-subvector.ll | 51 +-
llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 36 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 2044 +++++++++++++--
7 files changed, 4205 insertions(+), 591 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index f4e651ec477d30..d64951001d9cba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -14,6 +14,7 @@
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
+#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -45,6 +46,7 @@ class AMDGPULateCodeGenPrepare
Function &F;
const DataLayout &DL;
const GCNSubtarget &ST;
+ const TargetTransformInfo &TTI;
AssumptionCache *const AC;
UniformityInfo &UA;
@@ -53,8 +55,9 @@ class AMDGPULateCodeGenPrepare
public:
AMDGPULateCodeGenPrepare(Function &F, const GCNSubtarget &ST,
+ const TargetTransformInfo &TTI,
AssumptionCache *AC, UniformityInfo &UA)
- : F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
+ : F(F), DL(F.getDataLayout()), ST(ST), TTI(TTI), AC(AC), UA(UA) {}
bool run();
bool visitInstruction(Instruction &) { return false; }
@@ -75,6 +78,8 @@ class LiveRegOptimizer {
Module &Mod;
const DataLayout &DL;
const GCNSubtarget &ST;
+ const TargetTransformInfo &TTI;
+
/// The scalar type to convert to
Type *const ConvertToScalar;
/// The set of visited Instructions
@@ -125,8 +130,43 @@ class LiveRegOptimizer {
return LK.first != TargetLoweringBase::TypeLegal;
}
- LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
- : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
+  // Filter candidates based on the operation or its cost.
+  // If an operation incurs a high enough cost, or natively works on a
+  // vector of illegal type (e.g. v2i8), then it makes sense to try
+  // to avoid scalarizing across basic blocks.
+  bool shouldReplaceBasedOnOp(Instruction *II) {
+    // Ignore debug and pseudo instructions.
+    if (II->isDebugOrPseudoInst())
+      return false;
+
+    // Instruction cost: an expensive operation qualifies regardless of opcode.
+    const auto Cost = TTI.getInstructionCost(
+        II, TargetTransformInfo::TargetCostKind::TCK_SizeAndLatency);
+    LLVM_DEBUG(
+      dbgs() << "shouldReplaceBasedOnOp: "
+             << *II << " Cost=" << Cost << '\n';
+    );
+    if (Cost >= 8)
+      return true;
+
+    // Intrinsics - assume they natively handle the illegal type.
+    if (isa<IntrinsicInst>(II))
+      return true;
+
+    // Stores
+    if (isa<StoreInst>(II))
+      return true;
+
+    // Shuffles
+    if (isa<ShuffleVectorInst>(II))
+      return true;
+
+    return false;
+  }
+
+ LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST,
+ const TargetTransformInfo &TTI)
+ : Mod(Mod), DL(Mod.getDataLayout()), ST(ST), TTI(TTI),
ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
};
@@ -140,7 +180,7 @@ bool AMDGPULateCodeGenPrepare::run() {
// vectors to equivalent vectors of legal type (which are converted back
// before uses in subsequent blocks), to pack the bits into fewer physical
// registers (used in CopyToReg/CopyFromReg pairs).
- LiveRegOptimizer LRO(*F.getParent(), ST);
+ LiveRegOptimizer LRO(*F.getParent(), ST, TTI);
bool Changed = false;
@@ -259,6 +299,9 @@ bool LiveRegOptimizer::optimizeLiveType(
if (!shouldReplace(II->getType()))
continue;
+ if (!shouldReplaceBasedOnOp(II))
+ continue;
+
if (PHINode *Phi = dyn_cast<PHINode>(II)) {
PhiNodes.insert(Phi);
// Collect all the incoming values of problematic PHI nodes.
@@ -478,11 +521,12 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
PreservedAnalyses
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);
- bool Changed = AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
+ bool Changed = AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();
if (!Changed)
return PreservedAnalyses::all();
@@ -518,13 +562,14 @@ bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
AssumptionCache &AC =
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UniformityInfo &UI =
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- return AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
+ return AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();
}
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 9c2fabce4bcdeb..96a167794dfb7a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -6,36 +6,28 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_mov_b32_e32 v4, 8
-; GFX906-NEXT: v_mov_b32_e32 v5, 16
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v3, v2, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v1, 0xff
+; GFX906-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dword v1, v4, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v3
-; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v3, v6, v7, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB0_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v0, v2, s[2:3]
+; GFX906-NEXT: global_load_dword v1, v4, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v3, v2, v3, v0
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX906-NEXT: .LBB0_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v3
-; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v3
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_mov_b32 s0, 0xffff
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX906-NEXT: v_and_b32_sdwa v1, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: global_store_short v1, v0, s[6:7]
@@ -63,19 +55,34 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 2, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX906-NEXT: global_load_dword v1, v5, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB1_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v1, v2, s[2:3]
+; GFX906-NEXT: global_load_dword v1, v5, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX906-NEXT: .LBB1_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX906-NEXT: v_mov_b32_e32 v5, 8
+; GFX906-NEXT: v_mov_b32_e32 v0, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v1, v0, v2
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v3
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v4
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT: v_mov_b32_e32 v1, 0
+; GFX906-NEXT: global_store_dword v1, v0, s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -99,28 +106,30 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB2_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX906-NEXT: .LBB2_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v4, 0
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1
-; GFX906-NEXT: global_store_byte v4, v1, s[6:7]
-; GFX906-NEXT: global_store_byte v4, v0, s[6:7] offset:1
-; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[6:7] offset:2
-; GFX906-NEXT: global_store_byte v4, v3, s[6:7] offset:3
-; GFX906-NEXT: global_store_byte v4, v2, s[6:7] offset:4
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX906-NEXT: global_store_byte v0, v3, s[6:7] offset:1
+; GFX906-NEXT: global_store_byte v0, v4, s[6:7] offset:2
+; GFX906-NEXT: global_store_byte v0, v5, s[6:7] offset:3
+; GFX906-NEXT: global_store_byte v0, v2, s[6:7] offset:4
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -144,19 +153,46 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v9, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v2
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB3_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v9, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v2
; GFX906-NEXT: .LBB3_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
+; GFX906-NEXT: v_mov_b32_e32 v10, 8
+; GFX906-NEXT: v_mov_b32_e32 v9, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v1, v9, v0
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v4
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v5
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v3
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v2, v9, v1
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v8
+; GFX906-NEXT: v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT: v_mov_b32_e32 v2, 0
+; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -180,19 +216,70 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v17, 4, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[0:1]
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v17, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v4
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB4_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[2:3]
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v17, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v4
; GFX906-NEXT: .LBB4_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[6:7]
+; GFX906-NEXT: v_mov_b32_e32 v18, 8
+; GFX906-NEXT: v_mov_b32_e32 v17, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v18, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v1, v17, v0
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v6
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 24, v7
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v5
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v18, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v2, v17, v1
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 24, v10
+; GFX906-NEXT: v_or3_b32 v1, v1, v2, v5
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v2, v3, v17, v2
+; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v12
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 24, v13
+; GFX906-NEXT: v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v18, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v3, v4, v17, v3
+; GFX906-NEXT: v_and_b32_e32 v4, 0xff, v15
+; GFX906-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 24, v16
+; GFX906-NEXT: v_or3_b32 v3, v3, v4, v5
+; GFX906-NEXT: v_mov_b32_e32 v4, 0
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -216,23 +303,123 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v32, 5, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[0:1]
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[0:1] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v32, s[0:1]
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v32, s[0:1] offset:16
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v31, 8, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v28, 8, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 24, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 24, v8
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v18, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v19, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v22, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v24, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v25, 24, v4
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB5_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[2:3]
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[2:3] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v32, s[2:3]
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v32, s[2:3] offset:16
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v31, 8, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v28, 8, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v29, 16, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 24, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 24, v8
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v18, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v19, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v21, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v22, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v24, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v25, 24, v4
; GFX906-NEXT: .LBB5_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[6:7]
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[6:7] offset:16
+; GFX906-NEXT: v_mov_b32_e32 v32, 8
+; GFX906-NEXT: v_mov_b32_e32 v33, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v7, v33, v0
+; GFX906-NEXT: v_and_b32_e32 v7, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 24, v10
+; GFX906-NEXT: v_or3_b32 v7, v0, v7, v9
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v32, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v8, v33, v0
+; GFX906-NEXT: v_and_b32_e32 v8, 0xff, v12
+; GFX906-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 24, v13
+; GFX906-NEXT: v_or3_b32 v8, v0, v8, v9
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v1, v33, v0
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v15
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 24, v16
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v9
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v32, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v2, v33, v1
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v18
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 24, v19
+; GFX906-NEXT: v_or3_b32 v1, v1, v2, v9
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v32, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v2, v3, v33, v2
+; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v21
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 24, v22
+; GFX906-NEXT: v_lshlrev_b32_sdwa v28, v32, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_or3_b32 v2, v2, v3, v9
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_lshlrev_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GFX906-NEXT: v_and_or_b32 v6, v6, v33, v28
+; GFX906-NEXT: v_and_b32_e32 v28, 0xff, v29
+; GFX906-NEXT: v_and_or_b32 v3, v4, v33, v3
+; GFX906-NEXT: v_and_b32_e32 v4, 0xff, v24
+; GFX906-NEXT: v_and_or_b32 v5, v5, v33, v31
+; GFX906-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX906-NEXT: v_lshlrev_b32_e32 v27, 24, v27
+; GFX906-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX906-NEXT: v_lshlrev_b32_e32 v29, 24, v30
+; GFX906-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 24, v25
+; GFX906-NEXT: v_or3_b32 v5, v5, v26, v27
+; GFX906-NEXT: v_or3_b32 v6, v6, v28, v29
+; GFX906-NEXT: v_or3_b32 v3, v3, v4, v9
+; GFX906-NEXT: v_mov_b32_e32 v4, 0
+; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7]
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -261,150 +448,1660 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1]
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[0:1] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: s_mov_b32 s14, -1
; GFX906-NEXT: s_mov_b32 s15, 0xe00000
; GFX906-NEXT: s_add_u32 s12, s12, s11
; GFX906-NEXT: s_addc_u32 s13, s13, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:780 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(13)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
+; GFX906-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(28)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(39)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:64
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[0:1] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[0:1] offset:128
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[0:1] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[0:1] offset:160
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[0:1] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[0:1] offset:192
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[0:1] offset:80
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v0
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v0
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v1
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v2
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v3
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v45
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v45
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v45
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v46
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v46
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v46
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v47
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v47
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v47
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v48
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v48
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v48
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:96
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[0:1] offset:112
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v57
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v57
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v57
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v58
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v58
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v58
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v59
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v59
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v59
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v60
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v60
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v60
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v41
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v41
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v41
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v42
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v42
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v42
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v43
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v43
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v43
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v44
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v44
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v44
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:128
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:144
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v53
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v53
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v53
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v54
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v54
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v54
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v55
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v55
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v55
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v56
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v56
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v56
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v25
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v25
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v25
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v26
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v26
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v26
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v27
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v27
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v27
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v28
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v28
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v28
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[0:1] offset:160
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:176
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v49
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v49
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v49
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v50
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v50
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v50
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v51
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v51
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v51
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v52
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v52
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v52
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v21
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v21
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v21
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v22
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v22
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v22
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v23
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v23
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v23
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v24
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v24
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[0:1] offset:192
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:208
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v33
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v33
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v33
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v34
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v34
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v34
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v35
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v35
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v35
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v36
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v36
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v36
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v17
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v17
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v17
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v18
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v18
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v19
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v19
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v20
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v20
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v20
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[0:1] offset:224
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:240
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v37
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v37
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v37
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v38
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v38
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v38
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v39
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v39
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v39
+; GFX906-NEXT: s_waitcnt vmcnt(8)
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v9
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v40
+; GFX906-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v11
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v40
+; GFX906-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v11
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v40
+; GFX906-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v11
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v9
+; GFX906-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v12
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v63, 24, v9
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v10
+; GFX906-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v12
+; GFX906-NEXT: buffer_store_dword v63, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v10
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v10
+; GFX906-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v12
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[2:3] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[2:3] offset:32
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[2:3]
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[2:3] offset:32
; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[2:3] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[2:3] offset:64
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[2:3] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[2:3] offset:96
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[2:3] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[2:3] offset:128
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[2:3] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[2:3] offset:160
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[2:3] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[2:3] offset:192
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[2:3] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[2:3] offset:224
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:240
-; GFX906-NEXT: .LBB6_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v5
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:780 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v5
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v6
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v6
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v7
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v7
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v8
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:776 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v8
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(13)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v0
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v0
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v1
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v2
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v3
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:596 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GFX906-NEXT: v_mov_b32_e32 v0, v57
-; GFX906-NEXT: v_mov_b32_e32 v1, v58
-; GFX906-NEXT: v_mov_b32_e32 v2, v59
-; GFX906-NEXT: v_mov_b32_e32 v3, v60
-; GFX906-NEXT: v_mov_b32_e32 v60, v56
-; GFX906-NEXT: v_mov_b32_e32 v59, v55
-; GFX906-NEXT: v_mov_b32_e32 v58, v54
-; GFX906-NEXT: v_mov_b32_e32 v57, v53
-; GFX906-NEXT: v_mov_b32_e32 v56, v52
-; GFX906-NEXT: v_mov_b32_e32 v55, v51
-; GFX906-NEXT: v_mov_b32_e32 v54, v50
-; GFX906-NEXT: v_mov_b32_e32 v53, v49
-; GFX906-NEXT: v_mov_b32_e32 v52, v48
-; GFX906-NEXT: v_mov_b32_e32 v51, v47
-; GFX906-NEXT: v_mov_b32_e32 v50, v46
-; GFX906-NEXT: v_mov_b32_e32 v49, v45
-; GFX906-NEXT: v_mov_b32_e32 v48, v44
-; GFX906-NEXT: v_mov_b32_e32 v47, v43
-; GFX906-NEXT: v_mov_b32_e32 v46, v42
-; GFX906-NEXT: v_mov_b32_e32 v45, v41
-; GFX906-NEXT: v_mov_b32_e32 v44, v40
-; GFX906-NEXT: v_mov_b32_e32 v43, v39
-; GFX906-NEXT: v_mov_b32_e32 v42, v38
-; GFX906-NEXT: v_mov_b32_e32 v41, v37
-; GFX906-NEXT: v_mov_b32_e32 v40, v36
-; GFX906-NEXT: v_mov_b32_e32 v39, v35
-; GFX906-NEXT: v_mov_b32_e32 v38, v34
-; GFX906-NEXT: v_mov_b32_e32 v37, v33
-; GFX906-NEXT: v_mov_b32_e32 v36, v32
-; GFX906-NEXT: v_mov_b32_e32 v35, v31
-; GFX906-NEXT: v_mov_b32_e32 v34, v30
-; GFX906-NEXT: v_mov_b32_e32 v33, v29
-; GFX906-NEXT: v_mov_b32_e32 v32, v28
-; GFX906-NEXT: v_mov_b32_e32 v31, v27
-; GFX906-NEXT: v_mov_b32_e32 v30, v26
-; GFX906-NEXT: v_mov_b32_e32 v29, v25
-; GFX906-NEXT: v_mov_b32_e32 v28, v24
-; GFX906-NEXT: v_mov_b32_e32 v27, v23
-; GFX906-NEXT: v_mov_b32_e32 v26, v22
-; GFX906-NEXT: v_mov_b32_e32 v25, v21
-; GFX906-NEXT: v_mov_b32_e32 v24, v20
-; GFX906-NEXT: v_mov_b32_e32 v23, v19
-; GFX906-NEXT: v_mov_b32_e32 v22, v18
-; GFX906-NEXT: v_mov_b32_e32 v21, v17
-; GFX906-NEXT: v_mov_b32_e32 v20, v16
-; GFX906-NEXT: v_mov_b32_e32 v19, v15
-; GFX906-NEXT: v_mov_b32_e32 v18, v14
-; GFX906-NEXT: v_mov_b32_e32 v17, v13
-; GFX906-NEXT: v_mov_b32_e32 v16, v12
-; GFX906-NEXT: v_mov_b32_e32 v15, v11
-; GFX906-NEXT: v_mov_b32_e32 v14, v10
-; GFX906-NEXT: v_mov_b32_e32 v13, v9
-; GFX906-NEXT: v_mov_b32_e32 v12, v8
-; GFX906-NEXT: v_mov_b32_e32 v11, v7
-; GFX906-NEXT: v_mov_b32_e32 v10, v6
-; GFX906-NEXT: v_mov_b32_e32 v9, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v3
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(28)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(39)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[2:3] offset:80
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v0
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v0
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v1
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v2
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v3
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v45
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v45
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v45
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v46
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v46
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v46
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v47
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v47
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v47
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v48
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v48
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v48
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[2:3] offset:96
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[2:3] offset:112
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v57
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v57
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v57
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v58
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v58
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v58
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v59
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v59
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v59
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v60
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v60
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v60
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v41
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v41
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v41
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v42
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v42
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v42
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v43
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v43
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v43
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v44
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v44
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v44
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[2:3] offset:128
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[2:3] offset:144
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v53
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v53
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v53
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v54
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v54
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v54
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v55
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v55
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v55
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v56
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v56
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v56
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v25
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v25
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v25
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v26
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v26
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v26
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v27
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v27
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v27
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v28
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v28
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v28
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[2:3] offset:160
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[2:3] offset:176
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v49
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v49
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v49
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v50
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v50
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v50
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v51
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v51
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v51
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v52
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v52
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v52
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v21
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v21
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v21
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v22
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v22
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v22
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v23
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v23
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v23
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v24
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v24
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v24
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[2:3] offset:192
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[2:3] offset:208
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v33
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v33
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v33
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v34
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v34
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v34
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v35
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v35
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v35
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v36
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v36
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v36
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v17
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v17
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v17
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v18
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v18
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v19
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v19
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v19
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v20
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v20
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v20
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[2:3] offset:224
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[2:3] offset:240
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v37
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v37
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v37
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v38
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v38
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v38
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v39
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v39
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v39
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v40
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v40
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v40
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v9
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v9
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v9
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v10
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v11
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v11
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v11
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v12
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v12
+; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v10
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v10
+; GFX906-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v9, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_store_dword v10, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v11, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v12, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v12
+; GFX906-NEXT: .LBB6_2: ; %bb.2
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:792 ; 4-byte Folded Spill
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:796 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:800 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:804 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v63, off, s[12:15], 0 offset:788 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:784 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:780 ; 4-byte Folded Reload
+; GFX906-NEXT: v_mov_b32_e32 v4, 8
+; GFX906-NEXT: v_mov_b32_e32 v63, 0xff
+; GFX906-NEXT: v_mov_b32_e32 v18, v16
+; GFX906-NEXT: v_mov_b32_e32 v17, v15
+; GFX906-NEXT: v_mov_b32_e32 v16, v14
+; GFX906-NEXT: v_mov_b32_e32 v15, v13
+; GFX906-NEXT: v_mov_b32_e32 v19, v9
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:768 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v5, v5, v63, v61
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v6, v6, v63, v61
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:776 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v7, v7, v63, v61
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v61, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:692 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:712 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v8, v8, v63, v61
+; GFX906-NEXT: v_and_b32_e32 v61, 0xff, v62
+; GFX906-NEXT: v_lshlrev_b32_e32 v61, 16, v61
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_e32 v62, 24, v10
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:716 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: v_or3_b32 v5, v5, v61, v62
+; GFX906-NEXT: v_mov_b32_e32 v61, 0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v6, v6, v9, v10
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:736 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:740 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:744 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:748 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT: global_store_dwordx4 v61, v[5:8], s[6:7]
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:696 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:700 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:564 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:568 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:704 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:708 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v5, v11, v63, v5
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v6, v12, v63, v6
+; GFX906-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:572 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:576 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v5, v5, v9, v10
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:588 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:592 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v7, v13, v63, v7
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX906-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; GFX906-NEXT: v_or3_b32 v6, v6, v11, v12
+; GFX906-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:668 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:596 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:600 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:672 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v8, v14, v63, v8
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX906-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT: global_store_dwordx4 v61, v[5:8], s[6:7] offset:16
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:752 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:760 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:636 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:640 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:756 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:764 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v5, v29, v63, v5
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v5, v5, v9, v10
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:676 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:680 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v7, v31, v63, v7
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v6, v30, v63, v6
+; GFX906-NEXT: v_or3_b32 v6, v6, v11, v12
+; GFX906-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:524 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:684 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:688 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v8, v32, v63, v8
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT: global_store_dwordx4 v61, v[5:8], s[6:7] offset:32
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:652 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:660 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:500 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:504 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:528 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:656 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:664 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX906-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX906-NEXT: s_waitcnt vmcnt(6)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v5, v15, v63, v5
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v5, v5, v9, v10
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:548 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:552 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v7, v17, v63, v7
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v6, v16, v63, v6
+; GFX906-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; GFX906-NEXT: v_or3_b32 v6, v6, v11, v12
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v7, v7, v9, v10
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:556 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:560 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v8, v18, v63, v8
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; GFX906-NEXT: v_or3_b32 v8, v8, v9, v10
+; GFX906-NEXT: global_store_dwordx4 v61, v[5:8], s[6:7] offset:48
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:720 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v0, v63, v5
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:724 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:584 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:604 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:608 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v1, v63, v5
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:728 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT: v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:476 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v2, v2, v63, v5
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:732 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v3, v3, v63, v5
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:580 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:628 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:632 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:644 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:648 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:64
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:612 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:620 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:468 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:472 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:480 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:616 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:624 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT: s_waitcnt vmcnt(6)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v45, v63, v0
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:508 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:512 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v2, v47, v63, v2
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v46, v63, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT: v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:428 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:516 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:520 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v3, v48, v63, v3
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:80
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:532 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:540 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:404 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:408 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:432 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:536 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:544 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT: s_waitcnt vmcnt(6)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v57, v63, v0
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:452 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:456 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v2, v59, v63, v2
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v58, v63, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT: v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:460 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:464 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v3, v60, v63, v3
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:96
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:436 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:440 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:444 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:448 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT: s_waitcnt vmcnt(6)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v41, v63, v0
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v1, v42, v63, v1
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT: v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v2, v43, v63, v2
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:384 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v3, v44, v63, v3
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:112
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:484 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:492 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:488 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:496 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v53, v63, v0
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:412 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:416 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v2, v55, v63, v2
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v54, v63, v1
+; GFX906-NEXT: v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:420 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:424 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v3, v56, v63, v3
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:128
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:388 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:396 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:392 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:400 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT: s_waitcnt vmcnt(6)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v25, v63, v0
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v2, v27, v63, v2
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v26, v63, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT: v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:236 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v3, v28, v63, v3
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:144
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:164 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:168 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:240 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(6)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v49, v63, v0
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v2, v51, v63, v2
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v50, v63, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v3, v52, v63, v3
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT: v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:160
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:244 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:248 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:252 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:256 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v21, v63, v0
+; GFX906-NEXT: v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v22, v63, v1
+; GFX906-NEXT: v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v2, v23, v63, v2
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v3, v24, v63, v3
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:176
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v33, v63, v0
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v0, v0, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:172 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:176 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v2, v35, v63, v2
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v34, v63, v1
+; GFX906-NEXT: v_or3_b32 v1, v1, v7, v8
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v2, v2, v5, v6
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:228 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:232 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v3, v36, v63, v3
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:192
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:792 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:796 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:800 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:804 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(6)
+; GFX906-NEXT: v_and_or_b32 v0, v5, v63, v0
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:148 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_and_or_b32 v1, v6, v63, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT: v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:144 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:160 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX906-NEXT: v_and_or_b32 v2, v7, v63, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT: v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:152 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:156 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_and_or_b32 v3, v8, v63, v3
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:208
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:180 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:184 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:188 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:200 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:224 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:212 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(5)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v37, v63, v0
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:192 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:196 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(4)
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_and_or_b32 v1, v38, v63, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT: v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:204 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:208 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX906-NEXT: v_and_or_b32 v2, v39, v63, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT: v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:216 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:220 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX906-NEXT: v_and_or_b32 v3, v40, v63, v3
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX906-NEXT: v_or3_b32 v3, v3, v5, v6
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:224
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GFX906-NEXT: v_mov_b32_e32 v4, 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:784 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(6)
+; GFX906-NEXT: v_and_or_b32 v0, v5, v63, v0
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7]
-; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[6:7] offset:16
-; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[6:7] offset:32
-; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[6:7] offset:48
-; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[6:7] offset:64
-; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:80
-; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:96
-; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[6:7] offset:112
-; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[6:7] offset:128
-; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[6:7] offset:144
-; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[6:7] offset:160
-; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[6:7] offset:176
-; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[6:7] offset:192
-; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[6:7] offset:208
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:224
-; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
-; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:788 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v6, v63, v1
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:240
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT: v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX906-NEXT: v_and_or_b32 v2, v7, v63, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX906-NEXT: v_or3_b32 v2, v2, v3, v5
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 24, v19
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GFX906-NEXT: v_and_or_b32 v3, v8, v63, v3
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX906-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX906-NEXT: v_or3_b32 v3, v3, v4, v5
+; GFX906-NEXT: global_store_dwordx4 v61, v[0:3], s[6:7] offset:240
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -446,9 +2143,21 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr
; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX906-NEXT: global_load_dword v0, v0, s[2:3]
; GFX906-NEXT: .LBB7_5: ; %return.sink.split
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX906-NEXT: v_mov_b32_e32 v5, 8
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX906-NEXT: v_mov_b32_e32 v4, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GFX906-NEXT: v_and_or_b32 v0, v0, v4, v2
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX906-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX906-NEXT: v_mov_b32_e32 v1, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
; GFX906-NEXT: .LBB7_6: ; %return
; GFX906-NEXT: s_endpgm
@@ -480,32 +2189,76 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-LABEL: v8i8_phi_chain:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v9, s[8:9]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX906-NEXT: s_cbranch_execz .LBB8_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v9, s[10:11]
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2
; GFX906-NEXT: .LBB8_2: ; %Flow
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX906-NEXT: s_cbranch_execz .LBB8_4
; GFX906-NEXT: ; %bb.3: ; %bb.2
+; GFX906-NEXT: v_mov_b32_e32 v10, 8
+; GFX906-NEXT: v_mov_b32_e32 v0, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v9, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_b32_e32 v11, 0xff, v7
+; GFX906-NEXT: v_and_b32_e32 v12, 0xff, v8
+; GFX906-NEXT: v_and_or_b32 v9, v1, v0, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX906-NEXT: v_lshlrev_b32_e32 v12, 24, v12
+; GFX906-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_or3_b32 v9, v9, v11, v12
+; GFX906-NEXT: v_and_or_b32 v0, v2, v0, v10
+; GFX906-NEXT: v_and_b32_e32 v10, 0xff, v4
+; GFX906-NEXT: v_and_b32_e32 v11, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX906-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX906-NEXT: v_or3_b32 v10, v0, v10, v11
; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13]
+; GFX906-NEXT: global_store_dwordx2 v0, v[9:10], s[12:13]
; GFX906-NEXT: .LBB8_4: ; %bb.3
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
+; GFX906-NEXT: v_mov_b32_e32 v10, 8
+; GFX906-NEXT: v_mov_b32_e32 v9, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v1, v9, v0
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v7
+; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v8
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v6
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v2, v9, v1
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v5
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT: v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT: v_mov_b32_e32 v2, 0
+; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -534,17 +2287,23 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX906-LABEL: v8i8_multi_block:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v11, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9]
+; GFX906-NEXT: global_load_dwordx2 v[3:4], v11, s[8:9]
; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 24, v4
; GFX906-NEXT: v_mov_b32_e32 v1, v3
; GFX906-NEXT: v_mov_b32_e32 v2, v4
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB9_4
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v11, s[10:11]
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX906-NEXT: s_cbranch_execz .LBB9_3
@@ -553,11 +2312,31 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13]
; GFX906-NEXT: .LBB9_3: ; %Flow
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 24, v2
; GFX906-NEXT: .LBB9_4: ; %bb.3
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
+; GFX906-NEXT: v_mov_b32_e32 v4, 8
+; GFX906-NEXT: v_mov_b32_e32 v3, 0xff
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v0, v1, v3, v0
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v6
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 24, v7
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v5
+; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_or_b32 v1, v2, v3, v1
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v9
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v10
+; GFX906-NEXT: v_or3_b32 v1, v1, v2, v3
+; GFX906-NEXT: v_mov_b32_e32 v2, 0
+; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -586,32 +2365,45 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 8
-; GFX906-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX906-NEXT: v_mov_b32_e32 v4, 8
+; GFX906-NEXT: v_mov_b32_e32 v3, 0xff
; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dword v1, v1, s[0:1]
; GFX906-NEXT: s_mov_b64 s[0:1], 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0
-; GFX906-NEXT: v_mov_b32_e32 v2, 24
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 24, v1
+; GFX906-NEXT: v_and_or_b32 v0, v1, v3, v0
+; GFX906-NEXT: v_mov_b32_e32 v6, v5
; GFX906-NEXT: .LBB10_1: ; %bb.1
; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_and_or_b32 v7, v1, v3, v7
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX906-NEXT: s_and_b64 s[2:3], exec, vcc
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT: v_or3_b32 v6, v7, v6, v2
; GFX906-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX906-NEXT: v_mov_b32_e32 v7, v5
; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_cbranch_execnz .LBB10_1
; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v6
+; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index fcb8fa5997b7e8..28fcfa0070441d 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -8,30 +8,29 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_bitcmp0_b32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %bb10
-; CHECK-NEXT: global_load_dwordx2 v[8:9], v0, s[12:13]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[12:13]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, 0xff, v8
-; CHECK-NEXT: v_bfe_u32 v6, v8, 8, 8
-; CHECK-NEXT: v_bfe_u32 v5, v8, 16, 8
-; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8
-; CHECK-NEXT: v_and_b32_e32 v3, 0xff, v9
-; CHECK-NEXT: v_bfe_u32 v2, v9, 8, 8
-; CHECK-NEXT: v_bfe_u32 v1, v9, 16, 8
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v1
; CHECK-NEXT: s_branch .LBB0_3
; CHECK-NEXT: .LBB0_2:
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v5, 0
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: v_mov_b32_e32 v7, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: .LBB0_3: ; %bb41
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x48
; CHECK-NEXT: v_mov_b32_e32 v8, s14
@@ -48,16 +47,16 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr
; CHECK-NEXT: v_mov_b32_e32 v19, s25
; CHECK-NEXT: v_mov_b32_e32 v20, s26
; CHECK-NEXT: v_mov_b32_e32 v21, s27
-; CHECK-NEXT: flat_store_byte v[8:9], v7
-; CHECK-NEXT: flat_store_byte v[10:11], v6
-; CHECK-NEXT: flat_store_byte v[12:13], v5
-; CHECK-NEXT: flat_store_byte v[14:15], v4
-; CHECK-NEXT: flat_store_byte v[16:17], v3
-; CHECK-NEXT: flat_store_byte v[18:19], v2
-; CHECK-NEXT: flat_store_byte v[20:21], v1
+; CHECK-NEXT: flat_store_byte v[8:9], v0
+; CHECK-NEXT: flat_store_byte v[10:11], v7
+; CHECK-NEXT: flat_store_byte v[12:13], v6
+; CHECK-NEXT: flat_store_byte v[14:15], v5
+; CHECK-NEXT: flat_store_byte v[16:17], v1
+; CHECK-NEXT: flat_store_byte v[18:19], v4
+; CHECK-NEXT: flat_store_byte v[20:21], v3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; CHECK-NEXT: flat_store_byte v[2:3], v0
+; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT: flat_store_byte v[0:1], v2
; CHECK-NEXT: s_endpgm
bb:
br i1 %arg, label %bb10, label %bb41
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index efbbe2b27f10f9..6dabd8c0b83eae 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -13,9 +13,9 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -30,25 +30,27 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v3, v6, v2
-; SI-NEXT: v_or_b32_e32 v2, v4, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v6, v2
+; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB0_3
; SI-NEXT: s_branch .LBB0_4
; SI-NEXT: .LBB0_2:
-; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB0_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -61,29 +63,29 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v0
-; SI-NEXT: v_or_b32_e32 v2, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v2, v0
+; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: .LBB0_4: ; %exit
-; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v2
-; SI-NEXT: v_bfe_i32 v1, v2, 0, 16
-; SI-NEXT: v_bfe_i32 v2, v3, 0, 16
-; SI-NEXT: v_mov_b32_e32 v3, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v4, 1
-; SI-NEXT: v_mov_b32_e32 v5, 0xffff
-; SI-NEXT: v_mov_b32_e32 v6, 0x8000
+; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_mov_b32_e32 v3, 0xffff
+; SI-NEXT: v_mov_b32_e32 v4, 0x8000
+; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v6, 1
; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc
-; SI-NEXT: v_or_b32_e32 v0, v0, v4
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16
+; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v4
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16:
@@ -178,23 +180,26 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v2
-; SI-NEXT: v_or_b32_e32 v4, v4, v3
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; SI-NEXT: v_or_b32_e32 v3, v6, v3
+; SI-NEXT: v_or_b32_e32 v5, v5, v7
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB1_3
; SI-NEXT: s_branch .LBB1_4
; SI-NEXT: .LBB1_2:
-; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB1_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
@@ -209,39 +214,39 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_or_b32_e32 v5, v4, v0
-; SI-NEXT: v_or_b32_e32 v4, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v0
+; SI-NEXT: v_or_b32_e32 v5, v5, v1
; SI-NEXT: .LBB1_4: ; %exit
-; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4
-; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48
+; SI-NEXT: v_bfe_i32 v0, v5, 0, 16
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
-; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
-; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v5, 1
-; SI-NEXT: v_mov_b32_e32 v6, 0xffff
-; SI-NEXT: v_mov_b32_e32 v7, 0x8000
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc
+; SI-NEXT: v_bfe_i32 v3, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_mov_b32_e32 v4, 0xffff
+; SI-NEXT: v_mov_b32_e32 v5, 0x8000
+; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v7, 1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; SI-NEXT: v_or_b32_e32 v0, v1, v8
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v2, v3, v4
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
@@ -494,9 +499,9 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -527,25 +532,27 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT: v_or_b32_e32 v3, v6, v2
-; SI-NEXT: v_or_b32_e32 v2, v4, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v6, v2
+; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB3_3
; SI-NEXT: s_branch .LBB3_4
; SI-NEXT: .LBB3_2:
-; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB3_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -574,29 +581,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_or_b32_e32 v3, v4, v0
-; SI-NEXT: v_or_b32_e32 v2, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v2, v0
+; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: .LBB3_4: ; %exit
-; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v2
-; SI-NEXT: v_bfe_i32 v1, v2, 0, 16
-; SI-NEXT: v_bfe_i32 v2, v3, 0, 16
-; SI-NEXT: v_mov_b32_e32 v3, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v4, 1
-; SI-NEXT: v_mov_b32_e32 v5, 0xffff
-; SI-NEXT: v_mov_b32_e32 v6, 0x8000
+; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_mov_b32_e32 v3, 0xffff
+; SI-NEXT: v_mov_b32_e32 v4, 0x8000
+; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v6, 1
; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc
-; SI-NEXT: v_or_b32_e32 v0, v0, v4
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1
-; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16
+; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; SI-NEXT: v_or_b32_e32 v2, v3, v4
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_4xi16:
@@ -703,13 +710,13 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -727,15 +734,18 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; SI-NEXT: v_or_b32_e32 v5, v6, v2
-; SI-NEXT: v_or_b32_e32 v4, v4, v3
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v7, v2
+; SI-NEXT: v_or_b32_e32 v3, v6, v3
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB4_3
; SI-NEXT: s_branch .LBB4_4
; SI-NEXT: .LBB4_2:
-; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB4_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
@@ -750,11 +760,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -775,29 +785,29 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; SI-NEXT: v_or_b32_e32 v5, v4, v0
-; SI-NEXT: v_or_b32_e32 v4, v2, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; SI-NEXT: v_or_b32_e32 v2, v2, v0
+; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: .LBB4_4: ; %exit
-; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4
-; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48
+; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
-; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000
-; SI-NEXT: v_bfrev_b32_e32 v5, 1
-; SI-NEXT: v_mov_b32_e32 v6, 0xffff
-; SI-NEXT: v_mov_b32_e32 v7, 0x8000
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
-; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc
+; SI-NEXT: v_mov_b32_e32 v4, 0xffff
+; SI-NEXT: v_mov_b32_e32 v5, 0x8000
+; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v7, 1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
-; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; SI-NEXT: v_or_b32_e32 v0, v1, v8
+; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1195,21 +1205,21 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
-; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1227,39 +1237,46 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4
-; SI-NEXT: v_or_b32_e32 v5, v10, v2
-; SI-NEXT: v_or_b32_e32 v4, v8, v3
-; SI-NEXT: v_or_b32_e32 v3, v7, v9
-; SI-NEXT: v_or_b32_e32 v2, v6, v11
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v11, v2
+; SI-NEXT: v_or_b32_e32 v8, v8, v12
+; SI-NEXT: v_or_b32_e32 v2, v10, v13
+; SI-NEXT: v_or_b32_e32 v9, v9, v14
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB7_3
; SI-NEXT: s_branch .LBB7_4
; SI-NEXT: .LBB7_2:
-; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; SI-NEXT: ; implicit-def: $vgpr9
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr8
+; SI-NEXT: ; implicit-def: $vgpr6
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr7
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB7_3: ; %T
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
-; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc
+; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:2 glc
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc
+; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:10 glc
+; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:14 glc
+; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1277,52 +1294,52 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3
-; SI-NEXT: v_or_b32_e32 v5, v8, v0
-; SI-NEXT: v_or_b32_e32 v4, v7, v1
-; SI-NEXT: v_or_b32_e32 v3, v6, v9
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6
+; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4
+; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; SI-NEXT: v_or_b32_e32 v3, v3, v0
+; SI-NEXT: v_or_b32_e32 v8, v8, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v10
+; SI-NEXT: v_or_b32_e32 v9, v9, v11
; SI-NEXT: .LBB7_4: ; %exit
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8
+; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; SI-NEXT: s_movk_i32 s34, 0x3800
-; SI-NEXT: v_mov_b32_e32 v8, 0x3d000000
-; SI-NEXT: v_mov_b32_e32 v9, 0x39000000
-; SI-NEXT: v_mov_b32_e32 v10, 0x3d00
-; SI-NEXT: v_mov_b32_e32 v11, 0x3900
+; SI-NEXT: v_mov_b32_e32 v8, 0x3d00
+; SI-NEXT: v_mov_b32_e32 v9, 0x3900
+; SI-NEXT: v_mov_b32_e32 v10, 0x3d000000
+; SI-NEXT: v_mov_b32_e32 v11, 0x39000000
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v0
-; SI-NEXT: v_cndmask_b32_e32 v12, v8, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v1
-; SI-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2
-; SI-NEXT: v_cndmask_b32_e32 v13, v8, v9, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4
; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6
-; SI-NEXT: v_cndmask_b32_e32 v14, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5
-; SI-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc
-; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7
; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6
+; SI-NEXT: v_cndmask_b32_e32 v12, v10, v11, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc
-; SI-NEXT: v_or_b32_e32 v0, v0, v12
-; SI-NEXT: v_or_b32_e32 v4, v1, v13
-; SI-NEXT: v_or_b32_e32 v6, v2, v14
-; SI-NEXT: v_or_b32_e32 v2, v3, v5
-; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; SI-NEXT: v_alignbit_b32 v1, v2, v12, 16
-; SI-NEXT: v_alignbit_b32 v5, v6, v13, 16
-; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14
+; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7
+; SI-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2
+; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4
+; SI-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v4, v5, v12
+; SI-NEXT: v_or_b32_e32 v6, v3, v7
+; SI-NEXT: v_or_b32_e32 v2, v2, v8
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_alignbit_b32 v5, v6, v12, 16
+; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_8xi16_0:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index 36a93bd2511ced..1e86842be4e5e5 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -8,7 +8,8 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr4
; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GCN-NEXT: s_cbranch_execz .LBB0_2
@@ -19,22 +20,22 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN-NEXT: s_mov_b32 s9, s10
; GCN-NEXT: buffer_load_ushort v0, v[2:3], s[8:11], 0 addr64 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:2 glc
+; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:2 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:4 glc
+; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:4 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:6 glc
+; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:6 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:8 glc
+; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:8 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:10 glc
+; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:10 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:12 glc
+; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:12 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64 offset:14 glc
+; GCN-NEXT: buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:14 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_or_b32_e32 v4, v0, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GCN-NEXT: v_or_b32_e32 v5, v0, v1
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: .LBB0_2: ; %Flow
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
@@ -46,36 +47,36 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN-NEXT: s_mov_b32 s9, s10
; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:2 glc
+; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:2 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:6 glc
+; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:6 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
+; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:8 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:10 glc
+; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:10 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:12 glc
+; GCN-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:12 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:14 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GCN-NEXT: v_or_b32_e32 v4, v2, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT: v_or_b32_e32 v5, v2, v0
; GCN-NEXT: .LBB0_4: ; %exit
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_ashrrev_i32_e32 v0, 16, v4
+; GCN-NEXT: v_bfe_i32 v0, v5, 0, 16
; GCN-NEXT: v_bfe_i32 v1, v4, 0, 16
; GCN-NEXT: v_mov_b32_e32 v2, 0xffff
; GCN-NEXT: v_mov_b32_e32 v3, 0x8000
; GCN-NEXT: v_mov_b32_e32 v4, 0xffff8000
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
br i1 %c0, label %T, label %F
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 459ef648fd806c..dcfcffb50c188d 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -2102,15 +2102,18 @@ define void @crash_lshlrevb16_not_reg_op() {
; NOSDWA: ; %bb.0: ; %bb0
; NOSDWA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOSDWA-NEXT: s_mov_b64 s[4:5], 0
+; NOSDWA-NEXT: s_and_b32 s6, s4, 0xff
+; NOSDWA-NEXT: s_bitset1_b32 s6, 8
+; NOSDWA-NEXT: s_and_b32 s6, s6, 0x1ff
; NOSDWA-NEXT: s_and_b64 vcc, exec, -1
; NOSDWA-NEXT: .LBB22_1: ; %bb1
; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1
-; NOSDWA-NEXT: s_lshl_b32 s6, s4, 3
+; NOSDWA-NEXT: s_lshl_b32 s7, s4, 3
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
-; NOSDWA-NEXT: s_lshr_b32 s6, 0x100, s6
+; NOSDWA-NEXT: s_lshr_b32 s7, s6, s7
; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_mov_b64 s[4:5], 1
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s7
; NOSDWA-NEXT: flat_store_byte v[0:1], v2
; NOSDWA-NEXT: s_mov_b64 vcc, vcc
; NOSDWA-NEXT: s_cbranch_vccnz .LBB22_1
@@ -2122,15 +2125,18 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX89: ; %bb.0: ; %bb0
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: s_mov_b64 s[4:5], 0
+; GFX89-NEXT: s_and_b32 s6, s4, 0xff
+; GFX89-NEXT: s_bitset1_b32 s6, 8
+; GFX89-NEXT: s_and_b32 s6, s6, 0x1ff
; GFX89-NEXT: s_and_b64 vcc, exec, -1
; GFX89-NEXT: .LBB22_1: ; %bb1
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX89-NEXT: s_lshl_b32 s6, s4, 3
+; GFX89-NEXT: s_lshl_b32 s7, s4, 3
; GFX89-NEXT: v_mov_b32_e32 v0, s4
-; GFX89-NEXT: s_lshr_b32 s6, 0x100, s6
+; GFX89-NEXT: s_lshr_b32 s7, s6, s7
; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_mov_b64 s[4:5], 1
-; GFX89-NEXT: v_mov_b32_e32 v2, s6
+; GFX89-NEXT: v_mov_b32_e32 v2, s7
; GFX89-NEXT: flat_store_byte v[0:1], v2
; GFX89-NEXT: s_mov_b64 vcc, vcc
; GFX89-NEXT: s_cbranch_vccnz .LBB22_1
@@ -2142,15 +2148,18 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX9: ; %bb.0: ; %bb0
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_and_b32 s6, s4, 0xff
+; GFX9-NEXT: s_bitset1_b32 s6, 8
+; GFX9-NEXT: s_and_b32 s6, s6, 0x1ff
; GFX9-NEXT: s_and_b64 vcc, exec, -1
; GFX9-NEXT: .LBB22_1: ; %bb1
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_lshl_b32 s6, s4, 3
+; GFX9-NEXT: s_lshl_b32 s7, s4, 3
; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: s_lshr_b32 s6, 0x100, s6
+; GFX9-NEXT: s_lshr_b32 s7, s6, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_mov_b64 s[4:5], 1
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: flat_store_byte v[0:1], v2
; GFX9-NEXT: s_mov_b64 vcc, vcc
; GFX9-NEXT: s_cbranch_vccnz .LBB22_1
@@ -2161,14 +2170,17 @@ define void @crash_lshlrevb16_not_reg_op() {
; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
; GFX10: ; %bb.0: ; %bb0
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b64 s[4:5], 0
+; GFX10-NEXT: s_and_b32 s4, s4, 0xff
; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo
+; GFX10-NEXT: s_or_b32 s6, s4, 0x100
+; GFX10-NEXT: s_mov_b64 s[4:5], 0
+; GFX10-NEXT: s_and_b32 s6, s6, 0x1ff
; GFX10-NEXT: .LBB22_1: ; %bb1
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_lshl_b32 s6, s4, 3
+; GFX10-NEXT: s_lshl_b32 s7, s4, 3
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: s_lshr_b32 s4, 0x100, s6
+; GFX10-NEXT: s_lshr_b32 s4, s6, s7
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: s_mov_b64 s[4:5], 1
; GFX10-NEXT: flat_store_byte v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index b5e4bcd049c42a..bc1aefb258eadd 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -6,31 +6,27 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 8
-; GFX906-NEXT: s_mov_b32 s4, 0xff0000
-; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v4, v2, s[0:1]
+; GFX906-NEXT: v_lshlrev_b32_e32 v5, 2, v0
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: global_load_dword v2, v5, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB0_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v0, v2, s[2:3]
+; GFX906-NEXT: global_load_dword v2, v5, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX906-NEXT: .LBB0_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[6:7] offset:2
-; GFX906-NEXT: global_store_short v1, v4, s[6:7]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_byte v1, v3, s[6:7] offset:2
+; GFX906-NEXT: global_store_short v1, v0, s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -54,19 +50,31 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v2, v3, s[0:1]
+; GFX906-NEXT: global_load_dword v2, v6, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB1_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dword v2, v3, s[2:3]
+; GFX906-NEXT: global_load_dword v2, v6, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX906-NEXT: .LBB1_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dword v1, v2, s[6:7]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v5
+; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dword v1, v0, s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -90,23 +98,32 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 0
+; GFX906-NEXT: v_lshlrev_b32_e32 v7, 3, v0
+; GFX906-NEXT: v_mov_b32_e32 v5, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB2_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v7, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX906-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX906-NEXT: .LBB2_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: global_store_byte v3, v2, s[6:7] offset:4
-; GFX906-NEXT: global_store_dword v3, v1, s[6:7]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v6
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_byte v5, v2, s[6:7] offset:4
+; GFX906-NEXT: global_store_dword v5, v0, s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -130,19 +147,42 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v10, 3, v0
; GFX906-NEXT: v_mov_b32_e32 v3, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB3_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v10, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX906-NEXT: .LBB3_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[6:7]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v9
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v7
+; GFX906-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v6
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v4
+; GFX906-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -166,19 +206,64 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v6, 4, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v18, 4, v0
; GFX906-NEXT: v_mov_b32_e32 v5, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[0:1]
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB4_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[2:3]
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v18, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
; GFX906-NEXT: .LBB4_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[6:7]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v17
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15
+; GFX906-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v14
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v12
+; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v11
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v9
+; GFX906-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v8
+; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v6
+; GFX906-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -202,23 +287,113 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v10, 5, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v31, 5, v0
; GFX906-NEXT: v_mov_b32_e32 v9, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[0:1] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[0:1]
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[0:1] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v18, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v1
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB5_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[2:3] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[2:3]
+; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[2:3] offset:16
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v31, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v18, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v20, 8, v1
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v21, 24, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v22, 16, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v23, 8, v8
+; GFX906-NEXT: v_lshrrev_b32_e32 v24, 24, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v25, 16, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v26, 8, v7
+; GFX906-NEXT: v_lshrrev_b32_e32 v27, 24, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v28, 16, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v6
+; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
; GFX906-NEXT: .LBB5_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[6:7] offset:16
-; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30
+; GFX906-NEXT: v_lshlrev_b16_e32 v31, 8, v33
+; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT: v_lshlrev_b16_e32 v27, 8, v27
+; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26
+; GFX906-NEXT: v_lshlrev_b16_e32 v24, 8, v24
+; GFX906-NEXT: v_lshlrev_b16_e32 v23, 8, v23
+; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v5, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v7, v7, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v8, v8, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v5, v5, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[6:7]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v20
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v18
+; GFX906-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v17
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v15
+; GFX906-NEXT: v_or_b32_sdwa v5, v16, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v14
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v12
+; GFX906-NEXT: v_or_b32_sdwa v5, v13, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v11
+; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[6:7] offset:16
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -242,96 +417,1559 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT: v_lshlrev_b32_e32 v61, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v63, 3, v0
; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[0:1] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[0:1] offset:224
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[0:1] offset:208
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[0:1] offset:192
; GFX906-NEXT: s_mov_b32 s14, -1
; GFX906-NEXT: s_mov_b32 s15, 0xe00000
; GFX906-NEXT: s_add_u32 s12, s12, s11
; GFX906-NEXT: s_addc_u32 s13, s13, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: v_mov_b32_e32 v4, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:224
+; GFX906-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GFX906-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(18)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(29)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(40)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[0:1] offset:176
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[0:1] offset:160
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[0:1] offset:144
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[0:1] offset:128
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[0:1] offset:112
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[0:1] offset:96
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[0:1] offset:80
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[0:1] offset:64
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[0:1] offset:48
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[0:1] offset:32
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[0:1] offset:16
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[0:1] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[0:1] offset:192
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[0:1] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[0:1] offset:160
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[0:1] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[0:1] offset:128
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[0:1] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[0:1] offset:96
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[0:1] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[0:1] offset:64
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[0:1] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[0:1] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[0:1] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[0:1]
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[0:1]
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
+; GFX906-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
+; GFX906-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] offset:240
-; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[2:3] offset:240
+; GFX906-NEXT: global_load_dwordx4 v[5:8], v63, s[2:3] offset:224
+; GFX906-NEXT: global_load_dwordx4 v[9:12], v63, s[2:3] offset:208
+; GFX906-NEXT: global_load_dwordx4 v[13:16], v63, s[2:3] offset:192
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v3
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v3
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v3
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v2
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v2
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v1
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 8, v1
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 24, v0
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GFX906-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[2:3] offset:224
-; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[2:3] offset:208
-; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[2:3] offset:192
-; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[2:3] offset:176
-; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[2:3] offset:160
-; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[2:3] offset:144
-; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[2:3] offset:128
-; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[2:3] offset:112
-; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[2:3] offset:96
-; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[2:3] offset:80
-; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[2:3] offset:64
-; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[2:3] offset:48
-; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[2:3] offset:32
-; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[2:3] offset:16
-; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3]
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v0
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(18)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:64 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:68 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v8
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:72 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:76 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:80 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v7
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:84 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:88 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:92 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v6
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:96 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:100 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:104 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v5
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:108 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(29)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:112 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:116 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v12
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:120 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:124 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:128 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v11
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:132 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:136 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:140 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v10
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:144 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:148 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:152 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v9
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:156 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(40)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:160 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:164 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v16
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:168 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:176 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:180 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v15
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:172 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:188 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:192 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v14
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:184 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:200 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:204 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v13
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:196 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[17:20], v63, s[2:3] offset:176
+; GFX906-NEXT: global_load_dwordx4 v[21:24], v63, s[2:3] offset:160
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:208 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:212 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v20
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:224 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:216 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:220 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v19
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:236 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:228 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:232 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v18
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:248 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:240 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:244 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v17
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:252 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:256 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:260 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v24
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:272 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:264 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:268 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v23
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:284 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:276 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:280 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v22
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:296 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:288 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:292 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v21
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:300 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[25:28], v63, s[2:3] offset:144
+; GFX906-NEXT: global_load_dwordx4 v[29:32], v63, s[2:3] offset:128
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:304 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:308 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v28
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:320 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:312 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:316 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v27
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:332 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:324 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:328 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v26
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:344 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:336 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:340 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v25
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:348 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:352 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:356 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v32
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:368 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:360 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:364 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v31
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:380 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:372 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:376 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v30
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:392 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:384 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:388 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v29
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:396 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[33:36], v63, s[2:3] offset:112
+; GFX906-NEXT: global_load_dwordx4 v[37:40], v63, s[2:3] offset:96
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:400 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:404 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v36
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:416 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:408 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:412 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v35
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:428 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:420 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:424 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v34
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:440 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:432 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:436 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v33
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:444 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:448 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:452 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v40
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:464 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:456 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:460 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v39
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:476 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:468 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:472 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v38
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:488 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:480 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:484 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v37
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:492 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[41:44], v63, s[2:3] offset:80
+; GFX906-NEXT: global_load_dwordx4 v[45:48], v63, s[2:3] offset:64
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:496 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:500 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v44
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:512 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:504 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:508 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v43
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:524 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:516 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:520 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v42
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:536 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:528 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:532 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v41
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:540 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:544 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:548 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v48
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:560 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:552 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:556 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v47
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:572 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:564 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:568 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v46
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:584 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:576 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:580 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v45
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:588 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[49:52], v63, s[2:3] offset:48
+; GFX906-NEXT: global_load_dwordx4 v[53:56], v63, s[2:3] offset:32
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:592 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:596 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v52
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:608 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:600 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:604 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v51
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:620 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:612 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:616 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v50
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:632 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:624 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:628 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v49
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:636 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:640 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:644 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v56
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:656 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:648 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:652 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v55
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:668 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:660 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:664 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v54
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:680 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:672 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:676 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v53
+; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:684 ; 4-byte Folded Spill
+; GFX906-NEXT: global_load_dwordx4 v[57:60], v63, s[2:3] offset:16
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: global_load_dwordx4 v[0:3], v63, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:688 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:692 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v60
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:704 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:696 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:700 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v59
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:716 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:708 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:712 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v58
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:728 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:720 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:724 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v57
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:732 ; 4-byte Folded Spill
+; GFX906-NEXT: s_waitcnt vmcnt(12)
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:736 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:740 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v3
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:752 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:744 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:748 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v2
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:764 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v0
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:756 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 16, v1
+; GFX906-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:768 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v0
+; GFX906-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:760 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v61, 8, v1
+; GFX906-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:772 ; 4-byte Folded Spill
+; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v0
; GFX906-NEXT: .LBB6_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[6:7] offset:112
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[6:7] offset:96
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[6:7] offset:80
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[6:7] offset:64
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[6:7] offset:48
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[6:7] offset:32
-; GFX906-NEXT: s_waitcnt vmcnt(7)
-; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[6:7] offset:16
-; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v61, off, s[12:15], 0 offset:764 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v62, off, s[12:15], 0 offset:772 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v63, off, s[12:15], 0 offset:760 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v61, off, s[12:15], 0 offset:752 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v61, off, s[12:15], 0 offset:768 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v62, off, s[12:15], 0 offset:756 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v61, off, s[12:15], 0 offset:744 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v62, 8, v62
+; GFX906-NEXT: v_or_b32_sdwa v62, v63, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v62, off, s[12:15], 0 offset:748 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v61, off, s[12:15], 0 offset:736 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v62, off, s[12:15], 0 offset:740 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v61, 8, v61
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v61, v62, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:732 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:728 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:716 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:704 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v57, off, s[12:15], 0 offset:720 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v58, off, s[12:15], 0 offset:724 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v59, off, s[12:15], 0 offset:712 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v58, off, s[12:15], 0 offset:708 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v57, off, s[12:15], 0 offset:696 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v58, 8, v58
+; GFX906-NEXT: v_or_b32_sdwa v58, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v58, off, s[12:15], 0 offset:700 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v57, off, s[12:15], 0 offset:688 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v58, off, s[12:15], 0 offset:692 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v57, 8, v57
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v57, v58, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:684 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:680 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:668 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:656 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v53, off, s[12:15], 0 offset:672 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:676 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v55, off, s[12:15], 0 offset:664 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:660 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v53, off, s[12:15], 0 offset:648 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v54, 8, v54
+; GFX906-NEXT: v_or_b32_sdwa v54, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:652 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v53, off, s[12:15], 0 offset:640 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:644 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v53, 8, v53
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:32
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:636 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:632 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:620 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:608 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v49, off, s[12:15], 0 offset:624 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v50, off, s[12:15], 0 offset:628 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v51, off, s[12:15], 0 offset:616 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v50, off, s[12:15], 0 offset:612 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v49, off, s[12:15], 0 offset:600 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v50, 8, v50
+; GFX906-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v50, off, s[12:15], 0 offset:604 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v49, off, s[12:15], 0 offset:592 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v50, off, s[12:15], 0 offset:596 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v49, 8, v49
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:48
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:588 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:584 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:572 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:560 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v45, off, s[12:15], 0 offset:576 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v46, off, s[12:15], 0 offset:580 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v47, off, s[12:15], 0 offset:568 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v46, off, s[12:15], 0 offset:564 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v45, off, s[12:15], 0 offset:552 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v46, 8, v46
+; GFX906-NEXT: v_or_b32_sdwa v46, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v46, off, s[12:15], 0 offset:556 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v45, off, s[12:15], 0 offset:544 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v46, off, s[12:15], 0 offset:548 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v45, 8, v45
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v45, v46, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:64
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:540 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
-; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:536 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:524 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:512 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v41, off, s[12:15], 0 offset:528 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v42, off, s[12:15], 0 offset:532 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v43, off, s[12:15], 0 offset:520 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v42, off, s[12:15], 0 offset:516 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v41, off, s[12:15], 0 offset:504 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v42, 8, v42
+; GFX906-NEXT: v_or_b32_sdwa v42, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v42, off, s[12:15], 0 offset:508 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v41, off, s[12:15], 0 offset:496 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v42, off, s[12:15], 0 offset:500 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v44, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v41, 8, v41
; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v41, v42, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:80
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:492 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:488 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:476 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:464 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v37, off, s[12:15], 0 offset:480 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v38, off, s[12:15], 0 offset:484 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v39, off, s[12:15], 0 offset:472 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v38, off, s[12:15], 0 offset:468 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v37, off, s[12:15], 0 offset:456 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v38, 8, v38
+; GFX906-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v38, off, s[12:15], 0 offset:460 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v37, off, s[12:15], 0 offset:448 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v38, off, s[12:15], 0 offset:452 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:96
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:444 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:440 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:428 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:416 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:432 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v34, off, s[12:15], 0 offset:436 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v35, off, s[12:15], 0 offset:424 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v34, off, s[12:15], 0 offset:420 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:408 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX906-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v34, off, s[12:15], 0 offset:412 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v33, off, s[12:15], 0 offset:400 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v34, off, s[12:15], 0 offset:404 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:112
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:396 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:392 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:380 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:368 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:384 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:388 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v31, off, s[12:15], 0 offset:376 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:372 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:360 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30
+; GFX906-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:364 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v29, off, s[12:15], 0 offset:352 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v30, off, s[12:15], 0 offset:356 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:128
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:348 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:344 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:332 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:320 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:336 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:340 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v27, off, s[12:15], 0 offset:328 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:324 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:312 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v26, 8, v26
+; GFX906-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:316 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v25, off, s[12:15], 0 offset:304 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v26, off, s[12:15], 0 offset:308 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v25, 8, v25
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:144
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:300 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:296 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:284 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:272 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:288 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:292 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v23, off, s[12:15], 0 offset:280 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:276 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:264 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v22, 8, v22
+; GFX906-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:268 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v21, off, s[12:15], 0 offset:256 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v22, off, s[12:15], 0 offset:260 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:160
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:252 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:248 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:236 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:224 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:240 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:244 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v19, off, s[12:15], 0 offset:232 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:228 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:216 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v18, 8, v18
+; GFX906-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:220 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v17, off, s[12:15], 0 offset:208 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v18, off, s[12:15], 0 offset:212 ; 4-byte Folded Reload
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v17, 8, v17
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:176
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:200 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:204 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:192 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:184 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:188 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:164 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:196 ; 4-byte Folded Reload
+; GFX906-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:180 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:176 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:160 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:172 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v13, 8, v13
+; GFX906-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:168 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:192
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:156 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:152 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:148 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:140 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:128 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:144 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:136 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:132 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:124 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:116 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:120 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:112 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v9, 8, v9
+; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:208
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:108 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:104 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:100 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:92 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(3)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:80 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:96 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:88 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:84 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:76 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:68 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:72 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:64 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:224
+; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
+; GFX906-NEXT: s_nop 0
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(7)
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX906-NEXT: s_waitcnt vmcnt(6)
+; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(2)
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GFX906-NEXT: s_waitcnt vmcnt(1)
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; GFX906-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:240
-; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7] offset:224
-; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[6:7] offset:208
-; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[6:7] offset:192
-; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[6:7] offset:176
-; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[6:7] offset:160
-; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:144
-; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:128
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -373,9 +2011,17 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr
; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX906-NEXT: global_load_dword v0, v0, s[2:3]
; GFX906-NEXT: .LBB7_5: ; %return.sink.split
-; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX906-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_mov_b32_e32 v4, 0
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dword v4, v0, s[6:7]
; GFX906-NEXT: .LBB7_6: ; %return
; GFX906-NEXT: s_endpgm
entry:
@@ -406,32 +2052,60 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-LABEL: v8i8_phi_chain:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v9, s[8:9]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v1
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX906-NEXT: s_cbranch_execz .LBB8_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v9, s[10:11]
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v1
; GFX906-NEXT: .LBB8_2: ; %Flow
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: v_lshlrev_b16_e32 v8, 8, v8
+; GFX906-NEXT: v_lshlrev_b16_e32 v7, 8, v7
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v6
+; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v5
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX906-NEXT: s_cbranch_execz .LBB8_4
; GFX906-NEXT: ; %bb.3: ; %bb.2
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13]
+; GFX906-NEXT: v_or_b32_sdwa v9, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v10, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v10, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v11, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_mov_b32_e32 v6, 0
+; GFX906-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx2 v6, v[9:10], s[12:13]
; GFX906-NEXT: .LBB8_4: ; %bb.3
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_mov_b32_e32 v6, 0
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx2 v6, v[1:2], s[14:15]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -461,40 +2135,87 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
; GFX906-LABEL: v8i8_phi_zeroinit:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2
+; GFX906-NEXT: ; implicit-def: $vgpr3
+; GFX906-NEXT: ; implicit-def: $vgpr13
+; GFX906-NEXT: ; implicit-def: $vgpr11
+; GFX906-NEXT: ; implicit-def: $vgpr14
+; GFX906-NEXT: ; implicit-def: $vgpr15
+; GFX906-NEXT: ; implicit-def: $vgpr12
+; GFX906-NEXT: ; implicit-def: $vgpr16
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[8:9]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX906-NEXT: s_cbranch_execz .LBB9_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11]
-; GFX906-NEXT: s_mov_b32 s4, 0
+; GFX906-NEXT: global_load_dwordx2 v[3:4], v4, s[10:11]
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
-; GFX906-NEXT: s_mov_b32 s5, s4
-; GFX906-NEXT: s_waitcnt vmcnt(1)
-; GFX906-NEXT: v_mov_b32_e32 v3, s4
-; GFX906-NEXT: v_mov_b32_e32 v4, s5
+; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX906-NEXT: v_mov_b32_e32 v5, 0
+; GFX906-NEXT: v_mov_b32_e32 v9, 0
+; GFX906-NEXT: v_mov_b32_e32 v8, 0
+; GFX906-NEXT: v_mov_b32_e32 v6, 0
+; GFX906-NEXT: v_mov_b32_e32 v2, v1
+; GFX906-NEXT: v_mov_b32_e32 v7, v1
; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX906-NEXT: v_mov_b32_e32 v10, v1
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v15, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v14, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v13, 8, v3
; GFX906-NEXT: .LBB9_2: ; %Flow
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX906-NEXT: s_cbranch_execz .LBB9_4
; GFX906-NEXT: ; %bb.3: ; %bb.2
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v1, v3
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v9
+; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v6
+; GFX906-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v7
+; GFX906-NEXT: v_lshlrev_b16_e32 v11, 8, v5
+; GFX906-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v11, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: v_mov_b32_e32 v2, v4
+; GFX906-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13]
+; GFX906-NEXT: v_mov_b32_e32 v3, v1
+; GFX906-NEXT: v_mov_b32_e32 v13, v9
+; GFX906-NEXT: v_mov_b32_e32 v11, v8
+; GFX906-NEXT: v_mov_b32_e32 v14, v6
+; GFX906-NEXT: v_mov_b32_e32 v4, v2
+; GFX906-NEXT: v_mov_b32_e32 v15, v7
+; GFX906-NEXT: v_mov_b32_e32 v12, v10
+; GFX906-NEXT: v_mov_b32_e32 v16, v5
; GFX906-NEXT: .LBB9_4: ; %bb.3
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v13
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v14
+; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15
+; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v16
+; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_mov_b32_e32 v2, 0
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -632,18 +2353,24 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX906-LABEL: v8i8_multi_block:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX906-NEXT: v_lshlrev_b32_e32 v12, 3, v0
; GFX906-NEXT: v_mov_b32_e32 v5, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dwordx2 v[3:4], v6, s[8:9]
+; GFX906-NEXT: global_load_dwordx2 v[3:4], v12, s[8:9]
; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v4
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v3
; GFX906-NEXT: v_mov_b32_e32 v1, v3
; GFX906-NEXT: v_mov_b32_e32 v2, v4
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB11_4
; GFX906-NEXT: ; %bb.1: ; %bb.1
-; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[10:11]
+; GFX906-NEXT: global_load_dwordx2 v[1:2], v12, s[10:11]
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX906-NEXT: s_cbranch_execz .LBB11_3
@@ -652,10 +2379,26 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13]
; GFX906-NEXT: .LBB11_3: ; %Flow
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_waitcnt vmcnt(0)
+; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v2
+; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v11, 8, v1
; GFX906-NEXT: .LBB11_4: ; %bb.3
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[14:15]
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v11
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v9
+; GFX906-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v8
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v6
+; GFX906-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX906-NEXT: global_store_dwordx2 v5, v[0:1], s[14:15]
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -685,22 +2428,27 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX906-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0
-; GFX906-NEXT: s_mov_b32 s2, 0x2000604
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dword v1, v1, s[0:1]
; GFX906-NEXT: s_mov_b64 s[0:1], 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_mov_b32_e32 v0, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX906-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: .LBB12_1: ; %bb.1
; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc
-; GFX906-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
-; GFX906-NEXT: v_perm_b32 v0, v1, v0, s2
+; GFX906-NEXT: s_and_b64 s[2:3], exec, vcc
+; GFX906-NEXT: v_mov_b32_e32 v3, v2
+; GFX906-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX906-NEXT: v_mov_b32_e32 v2, v1
; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_cbranch_execnz .LBB12_1
; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v3
+; GFX906-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
More information about the llvm-commits
mailing list