[llvm] [AMDGPU] Implement LSR cost model for GFX9+ (PR #184138)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 2 06:58:01 PST 2026
https://github.com/michaelselehov updated https://github.com/llvm/llvm-project/pull/184138
>From 929b04bc429171d6cd5bf60ac1018e223255de6f Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Fri, 27 Feb 2026 04:09:09 -0600
Subject: [PATCH 1/2] [AMDGPU] Implement LSR cost model for GFX9+
AMDGPU previously had no target-specific LSR cost model, so the generic
heuristic would often introduce extra induction variables and base-add
chains that hurt VALU throughput on GFX9+ (observed on gfx942).
Implement a custom cost model:
- isLSRCostLess(): prioritize per-iteration instruction count over
setup costs, penalize IV multiplies, and demote register count.
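Pre-GFX9 falls back to the default comparator. The three hooks below are
not gated on generation, so pre-GFX9 codegen changes as well (see the
GFX7/GFX8 checks in copy-to-reg.ll).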
- getScalingFactorCost(): report that base+scale*index addressing
requires an extra ADD instruction.
- isNumRegsMajorCostOfLSR(): return false.
- shouldDropLSRSolutionIfLessProfitable(): return true.
Assisted-by: Claude Opus
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 42 +
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 10 +
llvm/test/CodeGen/AMDGPU/copy-to-reg.ll | 22 +-
llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 85 +-
llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 158 +-
llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 322 +-
.../AMDGPU/lsr-cost-model-vector-iv.ll | 37 +
...p-var-out-of-divergent-loop-swdev407790.ll | 22 +-
...ne-sink-temporal-divergence-swdev407790.ll | 192 +-
.../CodeGen/AMDGPU/memintrinsic-unroll.ll | 13313 ++++++++--------
.../waitcnt-loop-ds-prefetch-pattern.ll | 37 +-
.../AMDGPU/lsr-invalid-ptr-extend.ll | 14 +-
.../AMDGPU/preserve-addrspace-assert.ll | 6 +-
13 files changed, 7158 insertions(+), 7102 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/lsr-cost-model-vector-iv.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index d746ce65a6288..d50dd5bbb0e0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -16,6 +16,7 @@
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/InlineCost.h"
@@ -1703,3 +1704,44 @@ GCNTTIImpl::getInstructionUniformity(const Value *V) const {
return InstructionUniformity::Default;
}
+
+InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+ StackOffset BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ unsigned AddrSpace) const {
+ // AMDGPU has limited addressing modes: base+scale*index is costed as one
+ // extra ADD instruction; every other form defers to the BaseT default.
+ if (HasBaseReg && Scale != 0)
+ return 1;
+ return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
+ AddrSpace);
+}
+
+bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A,
+ const TTI::LSRCost &B) const {
+ // GFX9+: compare lexicographically, per-iteration work first. ScaleCost is
+ // added to Insns (base+scale*index needs a separate ADD); SetupCost,
+ // ImmCost and NumRegs only break ties after the loop-body terms.
+ if (getST()->getGeneration() >= AMDGPUSubtarget::GFX9) {
+ unsigned EffInsnsA = A.Insns + A.ScaleCost;
+ unsigned EffInsnsB = B.Insns + B.ScaleCost;
+
+ return std::tie(EffInsnsA, A.NumIVMuls, A.AddRecCost, A.NumBaseAdds,
+ A.SetupCost, A.ImmCost, A.NumRegs) <
+ std::tie(EffInsnsB, B.NumIVMuls, B.AddRecCost, B.NumBaseAdds,
+ B.SetupCost, B.ImmCost, B.NumRegs);
+ }
+
+ // Pre-GFX9: defer to the BaseT comparator unchanged.
+ return BaseT::isLSRCostLess(A, B);
+}
+
+bool GCNTTIImpl::isNumRegsMajorCostOfLSR() const {
+ // Mirrors the GFX9+ comparator; NOTE(review): not gated, so pre-GFX9 too.
+ return false;
+}
+
+bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
+ // Let LSR drop a solution costed as less profitable; applies to all gens.
+ return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 3ec157aacd0aa..24d4ec8d85d45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -311,6 +311,16 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
unsigned getNumberOfParts(Type *Tp) const override;
InstructionUniformity getInstructionUniformity(const Value *V) const override;
+
+ InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+ StackOffset BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ unsigned AddrSpace) const override;
+
+ bool isLSRCostLess(const TTI::LSRCost &A,
+ const TTI::LSRCost &B) const override;
+ bool isNumRegsMajorCostOfLSR() const override;
+ bool shouldDropLSRSolutionIfLessProfitable() const override;
};
} // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
index 931a14473c340..f5223d5553c6a 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
@@ -17,14 +17,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a,
; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_mov_b32 s0, 0
-; GFX7-NEXT: s_mov_b32 s1, 0
; GFX7-NEXT: .LBB0_1: ; %loop
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mov_b32_e32 v0, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: s_add_i32 s1, s1, 1
-; GFX7-NEXT: s_add_i32 s0, s0, 4
-; GFX7-NEXT: s_cmp_lt_u32 s1, 16
+; GFX7-NEXT: s_lshl_b32 s1, s0, 2
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: s_add_i32 s0, s0, 1
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_cmp_lt_u32 s0, 16
; GFX7-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
; GFX7-NEXT: s_cbranch_scc1 .LBB0_1
; GFX7-NEXT: ; %bb.2: ; %done
@@ -45,14 +44,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a,
; GFX8-NEXT: s_add_u32 s88, s88, s11
; GFX8-NEXT: s_addc_u32 s89, s89, 0
; GFX8-NEXT: s_mov_b32 s0, 0
-; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: .LBB0_1: ; %loop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: s_add_i32 s1, s1, 1
-; GFX8-NEXT: s_add_i32 s0, s0, 4
-; GFX8-NEXT: s_cmp_lt_u32 s1, 16
+; GFX8-NEXT: s_lshl_b32 s1, s0, 2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_add_i32 s0, s0, 1
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_cmp_lt_u32 s0, 16
; GFX8-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; GFX8-NEXT: s_cbranch_scc1 .LBB0_1
; GFX8-NEXT: ; %bb.2: ; %done
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index 8b7c49b5931af..2b093539b9e6d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -2741,16 +2741,15 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100
; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
-; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
-; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
+; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
+; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB116_1
; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX1250-SDAG-NEXT: s_endpgm
@@ -2758,19 +2757,19 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
-; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4
+; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB116_1
; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2
; GFX1250-GISEL-NEXT: s_endpgm
@@ -2779,16 +2778,15 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-NOECC: ; %bb.0: ; %bb
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NOECC-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-NOECC-NEXT: s_movk_i32 s0, 0x100
; GFX1250-NOECC-NEXT: .LBB116_1: ; %bb3
; GFX1250-NOECC-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NOECC-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NOECC-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
-; GFX1250-NOECC-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-NOECC-NEXT: s_wait_dscnt 0x0
-; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0x400
+; GFX1250-NOECC-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-NOECC-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
+; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-NOECC-NEXT: s_cbranch_scc0 .LBB116_1
; GFX1250-NOECC-NEXT: ; %bb.2: ; %bb2
; GFX1250-NOECC-NEXT: s_endpgm
@@ -2815,19 +2813,17 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100
; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
-; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
-; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
-; GFX1250-SDAG-NEXT: ; kill: killed $sgpr4_sgpr5
+; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
+; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1
; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX1250-SDAG-NEXT: s_endpgm
@@ -2835,22 +2831,21 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv_multiload:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
-; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
-; GFX1250-GISEL-NEXT: ; kill: killed $vgpr4 killed $vgpr5
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4
+; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1
; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2
; GFX1250-GISEL-NEXT: s_endpgm
@@ -2859,19 +2854,17 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-NOECC: ; %bb.0: ; %bb
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NOECC-NEXT: s_mov_b64 s[0:1], 0
+; GFX1250-NOECC-NEXT: s_movk_i32 s0, 0x100
; GFX1250-NOECC-NEXT: .LBB117_1: ; %bb3
; GFX1250-NOECC-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NOECC-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NOECC-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
-; GFX1250-NOECC-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-NOECC-NEXT: s_wait_dscnt 0x0
-; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0x400
-; GFX1250-NOECC-NEXT: ; kill: killed $sgpr4_sgpr5
+; GFX1250-NOECC-NEXT: s_add_co_i32 s0, s0, -1
+; GFX1250-NOECC-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
+; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-NOECC-NEXT: s_cbranch_scc0 .LBB117_1
; GFX1250-NOECC-NEXT: ; %bb.2: ; %bb2
; GFX1250-NOECC-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 49de34820c4c0..e792f2e7e6d8e 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4714,17 +4714,16 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a
define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX9-LABEL: global_addr_64bit_lsr_iv:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: s_movk_i32 s0, 0x100
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB132_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_add_u32 s4, s2, s0
-; GFX9-NEXT: s_addc_u32 s5, s3, s1
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 4
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
+; GFX9-NEXT: s_add_i32 s0, s0, -1
+; GFX9-NEXT: s_add_u32 s2, s2, 4
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc0 .LBB132_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -4732,17 +4731,16 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX10-LABEL: global_addr_64bit_lsr_iv:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-NEXT: s_movk_i32 s0, 0x100
; GFX10-NEXT: .LBB132_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
-; GFX10-NEXT: s_add_u32 s4, s2, s0
-; GFX10-NEXT: s_addc_u32 s5, s3, s1
-; GFX10-NEXT: s_add_u32 s0, s0, 4
-; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
+; GFX10-NEXT: s_add_i32 s0, s0, -1
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX10-NEXT: s_add_u32 s2, s2, 4
+; GFX10-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: s_cbranch_scc0 .LBB132_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@@ -4750,17 +4748,15 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX11-LABEL: global_addr_64bit_lsr_iv:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: s_movk_i32 s0, 0x100
; GFX11-NEXT: .LBB132_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_u32 s4, s2, s0
-; GFX11-NEXT: s_addc_u32 s5, s3, s1
-; GFX11-NEXT: s_add_u32 s0, s0, 4
-; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_add_i32 s0, s0, -1
+; GFX11-NEXT: s_add_u32 s2, s2, 4
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB132_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_endpgm
@@ -4768,38 +4764,34 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv:
; GFX12-SDAG: ; %bb.0: ; %bb
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x100
; GFX12-SDAG-NEXT: .LBB132_1: ; %bb3
; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
-; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
+; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0
; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB132_1
; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv:
; GFX12-GISEL: ; %bb.0: ; %bb
-; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x100
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2
-; GFX12-GISEL-NEXT: s_wait_alu depctr_va_vcc(0)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc
-; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4
-; GFX12-GISEL-NEXT: s_wait_alu depctr_va_vcc(0)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc
-; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, s[0:1], v0, 4
+; GFX12-GISEL-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s[0:1]
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB132_1
; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2
; GFX12-GISEL-NEXT: s_endpgm
@@ -4824,20 +4816,18 @@ bb3: ; preds = %bb3, %bb
define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg %arg, ptr addrspace(1) inreg %arg.1) {
; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: s_movk_i32 s0, 0x100
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB133_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_add_u32 s4, s2, s0
-; GFX9-NEXT: s_addc_u32 s5, s3, s1
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 4
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
-; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5
+; GFX9-NEXT: s_add_i32 s0, s0, -1
+; GFX9-NEXT: s_add_u32 s2, s2, 4
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc0 .LBB133_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -4845,20 +4835,18 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-NEXT: s_movk_i32 s0, 0x100
; GFX10-NEXT: .LBB133_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
-; GFX10-NEXT: s_add_u32 s4, s2, s0
-; GFX10-NEXT: s_addc_u32 s5, s3, s1
-; GFX10-NEXT: s_add_u32 s0, s0, 4
-; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
-; GFX10-NEXT: ; kill: killed $sgpr4 killed $sgpr5
+; GFX10-NEXT: s_add_i32 s0, s0, -1
+; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX10-NEXT: s_add_u32 s2, s2, 4
+; GFX10-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: s_cbranch_scc0 .LBB133_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@@ -4866,19 +4854,17 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
; GFX11-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: s_movk_i32 s0, 0x100
; GFX11-NEXT: .LBB133_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_u32 s4, s2, s0
-; GFX11-NEXT: s_addc_u32 s5, s3, s1
-; GFX11-NEXT: s_add_u32 s0, s0, 4
-; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
+; GFX11-NEXT: s_add_i32 s0, s0, -1
+; GFX11-NEXT: s_add_u32 s2, s2, 4
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB133_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_endpgm
@@ -4886,42 +4872,38 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX12-SDAG: ; %bb.0: ; %bb
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x100
; GFX12-SDAG-NEXT: .LBB133_1: ; %bb3
; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
-; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -1
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
+; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0
; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB133_1
; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX12-GISEL: ; %bb.0: ; %bb
-; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x100
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2
-; GFX12-GISEL-NEXT: s_wait_alu depctr_va_vcc(0)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc
-; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc, v2, 4
-; GFX12-GISEL-NEXT: s_wait_alu depctr_va_vcc(0)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc
-; GFX12-GISEL-NEXT: global_load_b32 v6, v[4:5], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_load_b32 v4, v[4:5], off scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0x400, v2
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, s[0:1], v0, 4
+; GFX12-GISEL-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s[0:1]
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX12-GISEL-NEXT: s_cbranch_vccz .LBB133_1
; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index b018fd3545374..6f9531ecfa1b8 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -8,44 +8,39 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT: s_sub_i32 s4, 0, s6
+; GFX9-NEXT: s_sub_i32 s2, 0, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: s_mul_i32 s4, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s8, s5, s4
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mul_i32 s2, s2, s4
+; GFX9-NEXT: s_mul_hi_u32 s2, s4, s2
+; GFX9-NEXT: s_add_i32 s4, s4, s2
+; GFX9-NEXT: s_mov_b32 s2, s3
; GFX9-NEXT: .LBB0_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_not_b32 s10, s5
-; GFX9-NEXT: s_mul_i32 s9, s6, s5
-; GFX9-NEXT: s_mul_i32 s10, s6, s10
-; GFX9-NEXT: s_add_i32 s11, s5, 1
-; GFX9-NEXT: s_sub_i32 s9, s7, s9
-; GFX9-NEXT: s_add_i32 s10, s7, s10
-; GFX9-NEXT: s_cmp_ge_u32 s9, s6
-; GFX9-NEXT: s_cselect_b32 s11, s11, s5
-; GFX9-NEXT: s_cselect_b32 s9, s10, s9
-; GFX9-NEXT: s_add_i32 s10, s11, 1
-; GFX9-NEXT: s_cmp_ge_u32 s9, s6
-; GFX9-NEXT: s_cselect_b32 s9, s10, s11
-; GFX9-NEXT: s_add_u32 s10, s0, s2
-; GFX9-NEXT: s_addc_u32 s11, s1, s3
-; GFX9-NEXT: s_add_i32 s7, s7, 1
-; GFX9-NEXT: s_add_u32 s4, s4, s8
-; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_add_u32 s2, s2, 4
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
-; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX9-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX9-NEXT: s_mul_i32 s7, s5, s6
+; GFX9-NEXT: s_sub_i32 s7, s2, s7
+; GFX9-NEXT: s_add_i32 s8, s5, 1
+; GFX9-NEXT: s_sub_i32 s9, s7, s6
+; GFX9-NEXT: s_cmp_ge_u32 s7, s6
+; GFX9-NEXT: s_cselect_b32 s5, s8, s5
+; GFX9-NEXT: s_cselect_b32 s7, s9, s7
+; GFX9-NEXT: s_add_i32 s8, s5, 1
+; GFX9-NEXT: s_cmp_ge_u32 s7, s6
+; GFX9-NEXT: s_cselect_b32 s5, s8, s5
+; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
+; GFX9-NEXT: s_add_u32 s8, s0, s8
+; GFX9-NEXT: s_addc_u32 s9, s1, s9
+; GFX9-NEXT: s_add_i32 s2, s2, 1
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400
+; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: s_cbranch_scc0 .LBB0_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -55,45 +50,40 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX10-NEXT: s_mov_b32 s7, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX10-NEXT: s_sub_i32 s2, 0, s6
+; GFX10-NEXT: s_sub_i32 s3, 0, s6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s2, s2, s4
-; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX10-NEXT: s_mov_b64 s[2:3], 0
-; GFX10-NEXT: s_add_i32 s8, s4, s5
-; GFX10-NEXT: s_mov_b64 s[4:5], 0
+; GFX10-NEXT: s_mul_i32 s3, s3, s2
+; GFX10-NEXT: s_mul_hi_u32 s4, s2, s3
+; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: s_add_i32 s4, s2, s4
+; GFX10-NEXT: s_mov_b32 s2, s3
; GFX10-NEXT: .LBB0_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX10-NEXT: s_mul_i32 s7, s5, s6
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
-; GFX10-NEXT: s_not_b32 s10, s5
-; GFX10-NEXT: s_mul_i32 s9, s6, s5
-; GFX10-NEXT: s_mul_i32 s10, s6, s10
-; GFX10-NEXT: s_sub_i32 s9, s7, s9
-; GFX10-NEXT: s_add_i32 s11, s5, 1
-; GFX10-NEXT: s_add_i32 s10, s7, s10
-; GFX10-NEXT: s_cmp_ge_u32 s9, s6
-; GFX10-NEXT: s_cselect_b32 s11, s11, s5
-; GFX10-NEXT: s_cselect_b32 s9, s10, s9
-; GFX10-NEXT: s_add_i32 s10, s11, 1
-; GFX10-NEXT: s_cmp_ge_u32 s9, s6
-; GFX10-NEXT: s_cselect_b32 s9, s10, s11
-; GFX10-NEXT: s_add_u32 s10, s0, s2
-; GFX10-NEXT: s_addc_u32 s11, s1, s3
-; GFX10-NEXT: s_add_i32 s7, s7, 1
-; GFX10-NEXT: s_add_u32 s4, s4, s8
-; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-NEXT: s_add_u32 s2, s2, 4
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
-; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX10-NEXT: s_add_i32 s8, s5, 1
+; GFX10-NEXT: s_sub_i32 s7, s2, s7
+; GFX10-NEXT: s_sub_i32 s9, s7, s6
+; GFX10-NEXT: s_cmp_ge_u32 s7, s6
+; GFX10-NEXT: s_cselect_b32 s5, s8, s5
+; GFX10-NEXT: s_cselect_b32 s7, s9, s7
+; GFX10-NEXT: s_add_i32 s8, s5, 1
+; GFX10-NEXT: s_cmp_ge_u32 s7, s6
+; GFX10-NEXT: s_cselect_b32 s5, s8, s5
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: s_add_u32 s8, s0, s8
+; GFX10-NEXT: s_addc_u32 s9, s1, s9
+; GFX10-NEXT: s_add_i32 s2, s2, 1
+; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
; GFX10-NEXT: s_cbranch_scc0 .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@@ -103,49 +93,46 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX11-NEXT: s_sub_i32 s2, 0, s6
+; GFX11-NEXT: s_sub_i32 s3, 0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mul_i32 s2, s2, s4
-; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX11-NEXT: s_mov_b64 s[2:3], 0
-; GFX11-NEXT: s_add_i32 s8, s4, s5
-; GFX11-NEXT: s_mov_b64 s[4:5], 0
+; GFX11-NEXT: s_mul_i32 s3, s3, s2
+; GFX11-NEXT: s_mul_hi_u32 s4, s2, s3
+; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: s_add_i32 s4, s2, s4
+; GFX11-NEXT: s_mov_b32 s2, s3
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB0_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX11-NEXT: s_mul_i32 s7, s5, s6
+; GFX11-NEXT: s_add_i32 s8, s5, 1
+; GFX11-NEXT: s_sub_i32 s7, s2, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_sub_i32 s9, s7, s6
+; GFX11-NEXT: s_cmp_ge_u32 s7, s6
+; GFX11-NEXT: s_cselect_b32 s5, s8, s5
+; GFX11-NEXT: s_cselect_b32 s7, s9, s7
+; GFX11-NEXT: s_add_i32 s8, s5, 1
+; GFX11-NEXT: s_cmp_ge_u32 s7, s6
+; GFX11-NEXT: s_cselect_b32 s5, s8, s5
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
+; GFX11-NEXT: v_mov_b32_e32 v1, s5
+; GFX11-NEXT: s_add_u32 s8, s0, s8
+; GFX11-NEXT: s_addc_u32 s9, s1, s9
+; GFX11-NEXT: s_add_i32 s2, s2, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s10, s5
-; GFX11-NEXT: s_mul_i32 s9, s6, s5
-; GFX11-NEXT: s_mul_i32 s10, s6, s10
-; GFX11-NEXT: s_sub_i32 s9, s7, s9
-; GFX11-NEXT: s_add_i32 s11, s5, 1
-; GFX11-NEXT: s_add_i32 s10, s7, s10
-; GFX11-NEXT: s_cmp_ge_u32 s9, s6
-; GFX11-NEXT: s_cselect_b32 s11, s11, s5
-; GFX11-NEXT: s_cselect_b32 s9, s10, s9
-; GFX11-NEXT: s_add_i32 s10, s11, 1
-; GFX11-NEXT: s_cmp_ge_u32 s9, s6
-; GFX11-NEXT: s_cselect_b32 s9, s10, s11
-; GFX11-NEXT: s_add_u32 s10, s0, s2
-; GFX11-NEXT: s_addc_u32 s11, s1, s3
-; GFX11-NEXT: s_add_i32 s7, s7, 1
-; GFX11-NEXT: s_add_u32 s4, s4, s8
-; GFX11-NEXT: v_mov_b32_e32 v1, s9
-; GFX11-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-NEXT: s_add_u32 s2, s2, 4
-; GFX11-NEXT: s_addc_u32 s3, s3, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
-; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
+; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400
+; GFX11-NEXT: global_store_b32 v0, v1, s[8:9]
; GFX11-NEXT: s_cbranch_scc0 .LBB0_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_endpgm
@@ -171,42 +158,37 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT: s_sub_i32 s4, 0, s6
+; GFX9-NEXT: s_sub_i32 s2, 0, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: s_mul_i32 s4, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s8, s5, s4
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mul_i32 s2, s2, s4
+; GFX9-NEXT: s_mul_hi_u32 s2, s4, s2
+; GFX9-NEXT: s_add_i32 s4, s4, s2
+; GFX9-NEXT: s_mov_b32 s2, s3
; GFX9-NEXT: .LBB1_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_not_b32 s10, s5
-; GFX9-NEXT: s_mul_i32 s9, s6, s5
-; GFX9-NEXT: s_mul_i32 s10, s6, s10
-; GFX9-NEXT: s_sub_i32 s9, s7, s9
-; GFX9-NEXT: s_add_i32 s10, s7, s10
-; GFX9-NEXT: s_cmp_ge_u32 s9, s6
-; GFX9-NEXT: s_cselect_b32 s9, s10, s9
-; GFX9-NEXT: s_sub_i32 s10, s9, s6
-; GFX9-NEXT: s_cmp_ge_u32 s9, s6
-; GFX9-NEXT: s_cselect_b32 s9, s10, s9
-; GFX9-NEXT: s_add_u32 s10, s0, s2
-; GFX9-NEXT: s_addc_u32 s11, s1, s3
-; GFX9-NEXT: s_add_i32 s7, s7, 1
-; GFX9-NEXT: s_add_u32 s4, s4, s8
-; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_add_u32 s2, s2, 4
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
-; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX9-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX9-NEXT: s_mul_i32 s5, s5, s6
+; GFX9-NEXT: s_sub_i32 s5, s2, s5
+; GFX9-NEXT: s_sub_i32 s7, s5, s6
+; GFX9-NEXT: s_cmp_ge_u32 s5, s6
+; GFX9-NEXT: s_cselect_b32 s5, s7, s5
+; GFX9-NEXT: s_sub_i32 s7, s5, s6
+; GFX9-NEXT: s_cmp_ge_u32 s5, s6
+; GFX9-NEXT: s_cselect_b32 s5, s7, s5
+; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
+; GFX9-NEXT: s_add_u32 s8, s0, s8
+; GFX9-NEXT: s_addc_u32 s9, s1, s9
+; GFX9-NEXT: s_add_i32 s2, s2, 1
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400
+; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: s_cbranch_scc0 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -216,43 +198,38 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX10-NEXT: s_mov_b32 s7, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX10-NEXT: s_sub_i32 s2, 0, s6
+; GFX10-NEXT: s_sub_i32 s3, 0, s6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s2, s2, s4
-; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX10-NEXT: s_mov_b64 s[2:3], 0
-; GFX10-NEXT: s_add_i32 s8, s4, s5
-; GFX10-NEXT: s_mov_b64 s[4:5], 0
+; GFX10-NEXT: s_mul_i32 s3, s3, s2
+; GFX10-NEXT: s_mul_hi_u32 s4, s2, s3
+; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: s_add_i32 s4, s2, s4
+; GFX10-NEXT: s_mov_b32 s2, s3
; GFX10-NEXT: .LBB1_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_not_b32 s9, s5
+; GFX10-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX10-NEXT: s_mul_i32 s5, s5, s6
+; GFX10-NEXT: s_sub_i32 s5, s2, s5
+; GFX10-NEXT: s_sub_i32 s7, s5, s6
+; GFX10-NEXT: s_cmp_ge_u32 s5, s6
+; GFX10-NEXT: s_cselect_b32 s5, s7, s5
+; GFX10-NEXT: s_sub_i32 s7, s5, s6
+; GFX10-NEXT: s_cmp_ge_u32 s5, s6
+; GFX10-NEXT: s_cselect_b32 s5, s7, s5
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
-; GFX10-NEXT: s_mul_i32 s10, s6, s5
-; GFX10-NEXT: s_mul_i32 s9, s6, s9
-; GFX10-NEXT: s_sub_i32 s10, s7, s10
-; GFX10-NEXT: s_add_i32 s9, s7, s9
-; GFX10-NEXT: s_cmp_ge_u32 s10, s6
-; GFX10-NEXT: s_cselect_b32 s9, s9, s10
-; GFX10-NEXT: s_sub_i32 s10, s9, s6
-; GFX10-NEXT: s_cmp_ge_u32 s9, s6
-; GFX10-NEXT: s_cselect_b32 s9, s10, s9
-; GFX10-NEXT: s_add_u32 s10, s0, s2
-; GFX10-NEXT: s_addc_u32 s11, s1, s3
-; GFX10-NEXT: s_add_i32 s7, s7, 1
-; GFX10-NEXT: s_add_u32 s4, s4, s8
-; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-NEXT: s_add_u32 s2, s2, 4
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
-; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: s_add_u32 s8, s0, s8
+; GFX10-NEXT: s_addc_u32 s9, s1, s9
+; GFX10-NEXT: s_add_i32 s2, s2, 1
+; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
; GFX10-NEXT: s_cbranch_scc0 .LBB1_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@@ -262,48 +239,45 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX11-NEXT: s_sub_i32 s2, 0, s6
+; GFX11-NEXT: s_sub_i32 s3, 0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mul_i32 s2, s2, s4
-; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX11-NEXT: s_mov_b64 s[2:3], 0
-; GFX11-NEXT: s_add_i32 s8, s4, s5
-; GFX11-NEXT: s_mov_b64 s[4:5], 0
+; GFX11-NEXT: s_mul_i32 s3, s3, s2
+; GFX11-NEXT: s_mul_hi_u32 s4, s2, s3
+; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: s_add_i32 s4, s2, s4
+; GFX11-NEXT: s_mov_b32 s2, s3
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB1_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4
+; GFX11-NEXT: s_mul_i32 s5, s5, s6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_sub_i32 s5, s2, s5
+; GFX11-NEXT: s_sub_i32 s7, s5, s6
+; GFX11-NEXT: s_cmp_ge_u32 s5, s6
+; GFX11-NEXT: s_cselect_b32 s5, s7, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s9, s5
-; GFX11-NEXT: s_mul_i32 s10, s6, s5
-; GFX11-NEXT: s_mul_i32 s9, s6, s9
-; GFX11-NEXT: s_sub_i32 s10, s7, s10
-; GFX11-NEXT: s_add_i32 s9, s7, s9
-; GFX11-NEXT: s_cmp_ge_u32 s10, s6
-; GFX11-NEXT: s_cselect_b32 s9, s9, s10
+; GFX11-NEXT: s_sub_i32 s7, s5, s6
+; GFX11-NEXT: s_cmp_ge_u32 s5, s6
+; GFX11-NEXT: s_cselect_b32 s5, s7, s5
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
+; GFX11-NEXT: v_mov_b32_e32 v1, s5
+; GFX11-NEXT: s_add_u32 s8, s0, s8
+; GFX11-NEXT: s_addc_u32 s9, s1, s9
+; GFX11-NEXT: s_add_i32 s2, s2, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s10, s9, s6
-; GFX11-NEXT: s_cmp_ge_u32 s9, s6
-; GFX11-NEXT: s_cselect_b32 s9, s10, s9
-; GFX11-NEXT: s_add_u32 s10, s0, s2
-; GFX11-NEXT: s_addc_u32 s11, s1, s3
-; GFX11-NEXT: s_add_i32 s7, s7, 1
-; GFX11-NEXT: s_add_u32 s4, s4, s8
-; GFX11-NEXT: v_mov_b32_e32 v1, s9
-; GFX11-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-NEXT: s_add_u32 s2, s2, 4
-; GFX11-NEXT: s_addc_u32 s3, s3, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
-; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
+; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400
+; GFX11-NEXT: global_store_b32 v0, v1, s[8:9]
; GFX11-NEXT: s_cbranch_scc0 .LBB1_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/lsr-cost-model-vector-iv.ll b/llvm/test/CodeGen/AMDGPU/lsr-cost-model-vector-iv.ll
new file mode 100644
index 0000000000000..ac8b5ec82cf38
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lsr-cost-model-vector-iv.ll
@@ -0,0 +1,37 @@
+; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+; Reduced from rocrand's threefry2x32_20 kernel.
+; The AMDGPU LSR cost model should avoid creating a redundant VGPR induction
+; variable when the loop already has a vector IV incremented by a uniform
+; (SGPR) stride. Without the cost model fix, LSR introduces a second v_add
+; in the loop body, wasting a VGPR and a VALU slot every iteration.
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+; CHECK-LABEL: {{^}}lsr_vector_iv_cost:
+; The loop must contain exactly one v_add_u32: the single vector IV update.
+; A second v_add_u32 here would mean LSR created a redundant IV.
+; CHECK: {{^}}.LBB0_1:
+; CHECK: v_add_u32
+; CHECK-NOT: v_add_u32
+; CHECK: s_cbranch
+define amdgpu_kernel void @lsr_vector_iv_cost(<2 x i32> %arg0, i32 %stride) {
+entry:
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ br label %loop
+
+loop:
+ %iv.pn = phi i32 [ 0, %entry ], [ %or, %loop ]
+ %iv.vec = phi i32 [ %tid, %entry ], [ %sum1, %loop ]
+ %sum1 = add i32 %iv.vec, %stride
+ %elt = extractelement <2 x i32> %arg0, i64 0
+ %sum2 = add i32 %sum1, %elt
+ %xor = xor i32 1, %sum2
+ %sum3 = add i32 %sum2, %xor
+ %sum4 = add i32 %sum3, %elt
+ %or = or i32 %sum4, %stride
+ %shr = lshr i32 %iv.pn, 1
+ br label %loop
+}
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
index b36d3212b688a..dfd4870787e62 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
@@ -19,11 +19,10 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: .LBB0_1: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; CHECK-NEXT: v_add_nc_u32_e32 v3, -4, v3
; CHECK-NEXT: .LBB0_2: ; %Flow1
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v1
+; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v3
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; j lastloop entry
; CHECK-NEXT: ;;#ASMEND
@@ -33,40 +32,41 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: .LBB0_3: ; %for.body33
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB0_6 Depth 2
-; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_2
; CHECK-NEXT: ; %bb.4: ; %for.body51.preheader
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: v_mov_b32_e32 v4, v2
+; CHECK-NEXT: s_mov_b32 s9, 4
; CHECK-NEXT: s_mov_b32 s8, 0
-; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: s_branch .LBB0_6
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_5: ; %if.end118
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v4, 4, v1
; CHECK-NEXT: s_add_i32 s9, s9, 4
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; backedge
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_add_nc_u32_e32 v3, s9, v2
-; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v3, v0
+; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v4, v0
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execz .LBB0_1
; CHECK-NEXT: .LBB0_6: ; %for.body51
; CHECK-NEXT: ; Parent Loop BB0_3 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, v4
+; CHECK-NEXT: v_mov_b32_e32 v3, 1
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB0_5
; CHECK-NEXT: ; %bb.7: ; %if.then112
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
-; CHECK-NEXT: s_add_i32 s10, s9, 4
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_mov_b32_e32 v3, s10
-; CHECK-NEXT: ds_write_b32 v1, v3
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: v_mov_b32_e32 v4, s9
+; CHECK-NEXT: ds_write_b32 v3, v4
; CHECK-NEXT: s_branch .LBB0_5
; CHECK-NEXT: .LBB0_8: ; %for.body159.preheader
; CHECK-NEXT: s_inst_prefetch 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 66683d323837d..0e250baa86090 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -62,7 +62,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
-; CHECK-NEXT: v_mov_b32_e32 v45, 0
+; CHECK-NEXT: v_mov_b32_e32 v46, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v43, v0
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@@ -91,7 +91,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s12, s51
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360
+; CHECK-NEXT: ds_write_b32 v46, v46 offset:15360
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43
@@ -118,69 +118,66 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_25
; CHECK-NEXT: ; %bb.1: ; %.preheader5
-; CHECK-NEXT: v_mul_lo_u32 v0, v41, 14
+; CHECK-NEXT: v_mul_lo_u32 v44, v41, 14
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_mov_b32 s5, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0
+; CHECK-NEXT: v_add_nc_u32_e32 v45, 0x3c04, v44
; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s5, v45
; CHECK-NEXT: s_add_i32 s5, s5, 1
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42
-; CHECK-NEXT: ds_write_b8 v1, v45
+; CHECK-NEXT: ds_write_b8 v0, v46
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_2
; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42
-; CHECK-NEXT: s_mov_b32 s55, 0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45
+; CHECK-NEXT: v_add_nc_u32_e32 v46, -1, v42
+; CHECK-NEXT: s_mov_b32 s53, 0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v46
; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB0_25
; CHECK-NEXT: ; %bb.4:
; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43
-; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0
; CHECK-NEXT: v_mov_b32_e32 v47, 0
-; CHECK-NEXT: s_mov_b32 s53, 0
+; CHECK-NEXT: s_mov_b32 s54, 0
; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB0_8 Depth 2
; CHECK-NEXT: ; Child Loop BB0_20 Depth 2
-; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44
-; CHECK-NEXT: s_lshl_b32 s4, s55, 5
-; CHECK-NEXT: s_add_i32 s54, s55, 1
-; CHECK-NEXT: s_add_i32 s5, s55, 5
-; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54
+; CHECK-NEXT: s_mov_b32 s4, s53
+; CHECK-NEXT: s_lshl_b32 s5, s53, 5
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v45
+; CHECK-NEXT: s_add_i32 s53, s53, 1
+; CHECK-NEXT: s_add_i32 s4, s4, 5
+; CHECK-NEXT: v_or3_b32 v57, s5, v43, s53
+; CHECK-NEXT: v_mov_b32_e32 v58, s53
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ds_read_u8 v56, v0
-; CHECK-NEXT: v_mov_b32_e32 v58, s54
-; CHECK-NEXT: s_mov_b32 s68, exec_lo
-; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42
+; CHECK-NEXT: s_mov_b32 s55, exec_lo
+; CHECK-NEXT: v_cmpx_lt_u32_e64 s4, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_17
; CHECK-NEXT: ; %bb.6: ; %.preheader2
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: s_mov_b32 s69, 0
-; CHECK-NEXT: s_mov_b32 s80, 0
+; CHECK-NEXT: s_mov_b32 s68, 0
+; CHECK-NEXT: s_mov_b32 s69, s53
; CHECK-NEXT: s_branch .LBB0_8
; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
-; CHECK-NEXT: s_add_i32 s80, s80, 4
-; CHECK-NEXT: s_add_i32 s4, s55, s80
-; CHECK-NEXT: v_add_nc_u32_e32 v0, s80, v57
-; CHECK-NEXT: s_add_i32 s5, s4, 5
-; CHECK-NEXT: s_add_i32 s4, s4, 1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
-; CHECK-NEXT: v_mov_b32_e32 v58, s4
-; CHECK-NEXT: s_or_b32 s69, vcc_lo, s69
-; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s69
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
+; CHECK-NEXT: s_add_i32 s4, s69, 4
+; CHECK-NEXT: v_add_nc_u32_e32 v57, 4, v57
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s4, v42
+; CHECK-NEXT: v_mov_b32_e32 v58, s69
+; CHECK-NEXT: s_or_b32 s68, vcc_lo, s68
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s68
; CHECK-NEXT: s_cbranch_execz .LBB0_16
; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT: v_add_nc_u32_e32 v59, s80, v46
-; CHECK-NEXT: v_add_nc_u32_e32 v58, s80, v57
-; CHECK-NEXT: ds_read_u8 v0, v59
+; CHECK-NEXT: v_add_nc_u32_e32 v58, s69, v45
+; CHECK-NEXT: s_mov_b32 s69, s4
+; CHECK-NEXT: ds_read_u8 v0, v58
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT: s_and_saveexec_b32 s81, s4
+; CHECK-NEXT: v_cmp_eq_u16_sdwa s5, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT: s_and_saveexec_b32 s80, s5
; CHECK-NEXT: s_cbranch_execz .LBB0_10
; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@@ -199,13 +196,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: ds_write_b32 v0, v58
+; CHECK-NEXT: ds_write_b32 v0, v57
; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
-; CHECK-NEXT: ds_read_u8 v0, v59 offset:1
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
+; CHECK-NEXT: ds_read_u8 v0, v58 offset:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT: s_and_saveexec_b32 s81, s4
+; CHECK-NEXT: s_and_saveexec_b32 s80, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_12
; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@@ -221,17 +218,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s12, s51
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58
+; CHECK-NEXT: v_add_nc_u32_e32 v59, 1, v57
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: ds_write_b32 v0, v60
+; CHECK-NEXT: ds_write_b32 v0, v59
; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
-; CHECK-NEXT: ds_read_u8 v0, v59 offset:2
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
+; CHECK-NEXT: ds_read_u8 v0, v58 offset:2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT: s_and_saveexec_b32 s81, s4
+; CHECK-NEXT: s_and_saveexec_b32 s80, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_14
; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@@ -247,17 +244,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s12, s51
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58
+; CHECK-NEXT: v_add_nc_u32_e32 v59, 2, v57
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: ds_write_b32 v0, v60
+; CHECK-NEXT: ds_write_b32 v0, v59
; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
-; CHECK-NEXT: ds_read_u8 v0, v59 offset:3
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
+; CHECK-NEXT: ds_read_u8 v0, v58 offset:3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT: s_and_saveexec_b32 s81, s4
+; CHECK-NEXT: s_and_saveexec_b32 s80, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_7
; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@@ -273,19 +270,18 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s12, s51
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
-; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58
+; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v57
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_16: ; %Flow43
-; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69
-; CHECK-NEXT: v_mov_b32_e32 v57, v0
-; CHECK-NEXT: .LBB0_17: ; %Flow44
+; CHECK-NEXT: .LBB0_16: ; %Flow32
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
+; CHECK-NEXT: .LBB0_17: ; %Flow33
+; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: s_mov_b32 s55, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_23
@@ -306,7 +302,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58
-; CHECK-NEXT: ds_read_u8 v0, v0
+; CHECK-NEXT: ds_read_u8 v0, v0 offset:15364
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s69, s4
@@ -330,24 +326,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v57
; CHECK-NEXT: s_branch .LBB0_19
-; CHECK-NEXT: .LBB0_22: ; %Flow41
+; CHECK-NEXT: .LBB0_22: ; %Flow30
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
-; CHECK-NEXT: .LBB0_23: ; %Flow42
+; CHECK-NEXT: .LBB0_23: ; %Flow31
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v46
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
-; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
-; CHECK-NEXT: s_mov_b32 s55, s54
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
-; CHECK-NEXT: s_or_b32 s53, s4, s53
-; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53
+; CHECK-NEXT: s_or_b32 s54, s4, s54
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s54
; CHECK-NEXT: s_cbranch_execnz .LBB0_5
-; CHECK-NEXT: .LBB0_25: ; %Flow49
+; CHECK-NEXT: .LBB0_25: ; %Flow38
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
@@ -828,7 +822,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14
+; CHECK-NEXT: v_mul_lo_u32 v44, v0, 14
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
@@ -842,7 +836,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360
-; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46
+; CHECK-NEXT: v_add_nc_u32_e32 v45, 0x3c04, v44
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42
@@ -866,52 +860,49 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v41, v0
; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42
-; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_mov_b32 s53, 0
; CHECK-NEXT: s_mov_b32 s52, 0
-; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364
-; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41
+; CHECK-NEXT: ds_write_b8 v44, v43 offset:15364
+; CHECK-NEXT: v_add_nc_u32_e32 v46, -1, v41
; CHECK-NEXT: .LBB1_1: ; %.37
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
-; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
-; CHECK-NEXT: s_lshl_b32 s5, s4, 5
-; CHECK-NEXT: s_add_i32 s53, s4, 1
-; CHECK-NEXT: s_add_i32 s6, s4, 5
-; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53
+; CHECK-NEXT: s_mov_b32 s4, s53
+; CHECK-NEXT: s_lshl_b32 s6, s53, 5
+; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v45
+; CHECK-NEXT: s_add_i32 s53, s53, 1
+; CHECK-NEXT: s_add_i32 s5, s4, 5
+; CHECK-NEXT: v_or3_b32 v56, s6, v42, s53
+; CHECK-NEXT: v_mov_b32_e32 v57, s53
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_read_u8 v46, v0
-; CHECK-NEXT: v_mov_b32_e32 v56, s53
-; CHECK-NEXT: s_mov_b32 s5, exec_lo
-; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
+; CHECK-NEXT: ds_read_u8 v47, v0
+; CHECK-NEXT: s_mov_b32 s4, exec_lo
+; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_5
; CHECK-NEXT: ; %bb.2: ; %.53.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .LBB1_3: ; %.53
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT: s_add_i32 s7, s7, 4
+; CHECK-NEXT: s_add_i32 s7, s5, 4
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
-; CHECK-NEXT: s_add_i32 s8, s4, s7
-; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
-; CHECK-NEXT: s_add_i32 s9, s8, 5
-; CHECK-NEXT: s_add_i32 s8, s8, 1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
-; CHECK-NEXT: v_mov_b32_e32 v56, s8
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s7, v41
+; CHECK-NEXT: v_add_nc_u32_e32 v56, 4, v56
+; CHECK-NEXT: v_mov_b32_e32 v57, s5
+; CHECK-NEXT: s_mov_b32 s5, s7
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB1_3
-; CHECK-NEXT: ; %bb.4: ; %Flow3
+; CHECK-NEXT: ; %bb.4: ; %Flow4
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; CHECK-NEXT: v_mov_b32_e32 v47, v0
-; CHECK-NEXT: .LBB1_5: ; %Flow4
+; CHECK-NEXT: .LBB1_5: ; %Flow5
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_mov_b32 s54, exec_lo
-; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
+; CHECK-NEXT: v_cmpx_lt_u32_e64 v57, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_11
; CHECK-NEXT: ; %bb.6: ; %.103.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
@@ -922,19 +913,19 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: .LBB1_7: ; %.114
; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64
+; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57
; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
-; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v57, v41
; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: s_cbranch_execz .LBB1_10
; CHECK-NEXT: .LBB1_8: ; %.103
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
-; CHECK-NEXT: ds_read_u8 v0, v0
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v57
+; CHECK-NEXT: ds_read_u8 v0, v0 offset:15364
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v47, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s64, s4
; CHECK-NEXT: s_cbranch_execz .LBB1_7
; CHECK-NEXT: ; %bb.9: ; %.110
@@ -955,23 +946,22 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: ds_write_b32 v0, v47
+; CHECK-NEXT: ds_write_b32 v0, v56
; CHECK-NEXT: s_branch .LBB1_7
; CHECK-NEXT: .LBB1_10: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
-; CHECK-NEXT: .LBB1_11: ; %Flow2
+; CHECK-NEXT: .LBB1_11: ; %Flow3
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s54
; CHECK-NEXT: ; %bb.12: ; %.32
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v45
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v46
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
; CHECK-NEXT: s_or_b32 s52, s4, s52
-; CHECK-NEXT: s_mov_b32 s4, s53
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52
; CHECK-NEXT: s_cbranch_execnz .LBB1_1
; CHECK-NEXT: ; %bb.13: ; %.119
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index c60642e2cc4d8..376def942343a 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -5405,96 +5405,92 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4
; CHECK-NEXT: s_cbranch_execz .LBB5_3
; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0x800
; CHECK-NEXT: .LBB5_2: ; %memmove_fwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_add_co_u32 v52, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v53, null, s5, v3, vcc_lo
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[52:53] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[52:53]
-; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v52, 48
-; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v53, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v52, 0x60
-; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v53, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v52
-; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v53, vcc_lo
-; CHECK-NEXT: s_clause 0xd
-; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[52:53] offset:32
-; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[52:53] offset:64
-; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[80:81] offset:128
-; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:144
-; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:96
-; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:112
-; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:64
-; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:80
-; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[52:53] offset:128
-; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[80:81] offset:48
-; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81]
+; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, 48
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v3, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v2
+; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v3, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v2, 0x60
+; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v3, vcc_lo
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:32
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] offset:64
+; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[2:3]
+; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[80:81] offset:144
+; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:128
+; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:112
+; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:96
+; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:80
+; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:64
+; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[80:81]
+; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[2:3] offset:128
+; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81] offset:48
; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[84:85]
; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97]
-; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
-; CHECK-NEXT: s_add_u32 s4, s4, 0x100
-; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48
-; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800
-; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
-; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:32
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11]
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:176
-; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:192
-; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:144
-; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:160
-; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:128
-; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:128
-; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48
+; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, 48
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, 0, v1, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00
+; CHECK-NEXT: s_addc_u32 s5, s5, -1
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(14)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(14)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:192
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(14)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:160
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(14)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:128
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(14)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:128
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(14)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:96
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[84:87] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99]
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99]
; CHECK-NEXT: s_cbranch_scc1 .LBB5_2
-; CHECK-NEXT: .LBB5_3: ; %Flow5
-; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6
+; CHECK-NEXT: .LBB5_3: ; %Flow15
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
; CHECK-NEXT: s_cbranch_execz .LBB5_6
; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; CHECK-NEXT: s_movk_i32 s6, 0xff00
-; CHECK-NEXT: s_mov_b64 s[4:5], 0x700
-; CHECK-NEXT: s_mov_b32 s7, -1
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x700, v2
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; CHECK-NEXT: s_movk_i32 s4, 0xf800
+; CHECK-NEXT: s_mov_b32 s5, -1
; CHECK-NEXT: .LBB5_5: ; %memmove_bwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo
-; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:64
-; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:32
-; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25]
-; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128
-; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v24, 0x60
-; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v25, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v24, 48
-; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v25, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v24
-; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v25, vcc_lo
-; CHECK-NEXT: s_clause 0xa
+; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, 48
+; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v3, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v2
+; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v3, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v2, 0x60
+; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v3, vcc_lo
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:128
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] offset:64
+; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[2:3] offset:32
+; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[2:3]
; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:128
; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:144
; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:96
@@ -5506,421 +5502,421 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[84:85]
; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97]
-; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
-; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00
-; CHECK-NEXT: s_addc_u32 s5, s5, -1
-; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48
-; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
-; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, 48
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, 0, v1, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
+; CHECK-NEXT: s_add_u32 s4, s4, 0x100
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15]
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0
; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:176
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176
; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:192
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:192
; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:144
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144
; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:160
; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:112
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[84:87] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99]
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99]
; CHECK-NEXT: s_cbranch_scc0 .LBB5_5
-; CHECK-NEXT: .LBB5_6: ; %Flow6
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: .LBB5_6: ; %Flow16
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
;
; ALIGNED-LABEL: memmove_p0_p0_sz2048:
; ALIGNED: ; %bb.0: ; %entry
; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_mov_b32 s4, exec_lo
; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4
; ALIGNED-NEXT: s_cbranch_execz .LBB5_3
; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800
; ALIGNED-NEXT: .LBB5_2: ; %memmove_fwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v6, vcc_lo, v4, 48
-; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[4:5] offset:128
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 0x90, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v38, vcc_lo, 0xa0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v39, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xb0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xc0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, 0xd0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v70, vcc_lo, 0xe0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v71, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v80, vcc_lo, 0xf0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: s_clause 0xe
-; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[4:5]
-; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[4:5] offset:16
-; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[4:5] offset:32
-; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[4:5] offset:64
-; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[6:7]
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[2:3] offset:128
+; ALIGNED-NEXT: flat_load_dwordx4 v[24:27], v[2:3] offset:64
+; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, 48
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v36, vcc_lo, 0x90, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v38, vcc_lo, 0xa0, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v39, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v48, vcc_lo, 0xb0, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v50, vcc_lo, 0xc0, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v51, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xd0, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, 0xe0, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v98, vcc_lo, 0xf0, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v99, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: s_clause 0xd
+; ALIGNED-NEXT: flat_load_dwordx4 v[28:31], v[2:3] offset:16
+; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[2:3] offset:32
+; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
-; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25]
-; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[38:39]
-; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[52:53]
-; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[54:55]
-; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[64:65]
-; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[70:71]
-; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[80:81]
-; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v1, vcc_lo
-; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
-; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT: flat_load_dwordx4 v[84:87], v[36:37]
+; ALIGNED-NEXT: flat_load_dwordx4 v[80:83], v[38:39]
+; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[48:49]
+; ALIGNED-NEXT: flat_load_dwordx4 v[64:67], v[50:51]
+; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[52:53]
+; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[96:97]
+; ALIGNED-NEXT: flat_load_dwordx4 v[36:39], v[98:99]
+; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[2:3]
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
+; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:140
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v102, off, s[0:3], s32 offset:172
-; ALIGNED-NEXT: buffer_load_dword v103, off, s[0:3], s32 offset:168
-; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164
-; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160
-; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v24, 6
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v25, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v24, 3
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v25, vcc_lo
-; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800
+; ALIGNED-NEXT: buffer_load_dword v112, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT: buffer_load_dword v113, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT: buffer_load_dword v114, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT: buffer_load_dword v115, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v0, 6
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v0, 3
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v102 offset:136
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:137
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v112 offset:136
+; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:137
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v103 offset:132
-; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:133
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v113 offset:132
+; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:133
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v56 offset:128
-; ALIGNED-NEXT: flat_store_byte v[20:21], v56 offset:129
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v57 offset:124
-; ALIGNED-NEXT: flat_store_byte v[24:25], v57 offset:128
-; ALIGNED-NEXT: s_waitcnt lgkmcnt(19)
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:228
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:232
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v114 offset:128
+; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:129
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v115 offset:124
+; ALIGNED-NEXT: flat_store_byte v[0:1], v115 offset:128
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(22)
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:204
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:236
-; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:232
-; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:228
-; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:192
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v101 offset:72
-; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:73
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:72
+; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:73
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v100 offset:68
-; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:69
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:68
+; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:69
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:64
-; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:65
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:64
+; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:65
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:60
-; ALIGNED-NEXT: flat_store_byte v[24:25], v98 offset:64
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:320
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:324
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:328
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:60
+; ALIGNED-NEXT: flat_store_byte v[0:1], v96 offset:64
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(28)
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:328
-; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:332
-; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:320
-; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:292
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:36
-; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:37
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:36
+; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:37
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:40
-; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:41
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:40
+; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:41
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:28
-; ALIGNED-NEXT: flat_store_byte v[24:25], v27 offset:32
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:32
-; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:33
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:272
-; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v25 offset:28
+; ALIGNED-NEXT: flat_store_byte v[0:1], v25 offset:32
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v24 offset:32
+; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:33
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:284
-; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276
-; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:24
-; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:25
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:24
+; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:25
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:20
-; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:21
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:20
+; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:21
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:16
-; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:17
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:12
-; ALIGNED-NEXT: flat_store_byte v[24:25], v30 offset:16
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288
-; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292
-; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:296
-; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:16
+; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:17
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:12
+; ALIGNED-NEXT: flat_store_byte v[0:1], v28 offset:16
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(32)
+; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:268
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300
-; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:296
-; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:292
-; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v112
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v112
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v114
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v114
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v115
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v115
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:8
-; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:9
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35 offset:8
+; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:9
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:4
-; ALIGNED-NEXT: flat_store_byte v[24:25], v36 offset:8
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v34 offset:4
+; ALIGNED-NEXT: flat_store_byte v[0:1], v34 offset:8
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35
-; ALIGNED-NEXT: flat_store_byte v[24:25], v35 offset:4
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[24:25], v34 offset:2
-; ALIGNED-NEXT: flat_store_byte v[24:25], v34
-; ALIGNED-NEXT: s_waitcnt lgkmcnt(40)
-; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112
-; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116
-; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120
-; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33
+; ALIGNED-NEXT: flat_store_byte v[0:1], v33 offset:4
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v32 offset:2
+; ALIGNED-NEXT: flat_store_byte v[0:1], v32
+; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:124
-; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120
-; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116
-; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:248
-; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:249
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:248
+; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:249
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:244
-; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:245
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:244
+; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:245
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:240
-; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:241
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:236
-; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:237
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:240
+; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:241
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:236
+; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:237
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:108
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:140
-; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:136
-; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:132
-; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:96
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:232
-; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:233
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:232
+; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:233
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:228
-; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:229
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:228
+; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:229
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:224
-; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:225
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:220
-; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:221
-; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80
-; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84
-; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88
-; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:224
+; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:225
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:220
+; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:221
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:60
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:92
-; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:88
-; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84
-; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:216
-; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:217
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:216
+; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:217
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:212
-; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:213
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:212
+; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:213
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:208
-; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:209
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:204
-; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:205
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:96
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:100
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:104
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:208
+; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:209
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:204
+; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:205
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:76
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:108
-; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:104
-; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:100
-; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:64
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:200
-; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:201
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:200
+; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:201
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:196
-; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:197
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:196
+; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:197
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:192
-; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:193
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:192
+; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:193
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:188
-; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:189
-; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:176
-; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:180
-; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:184
-; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:188
+; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:189
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:188
-; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:184
-; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:180
-; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:144
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 offset:184
-; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:185
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:184
+; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:185
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v80 offset:180
-; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:181
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:180
+; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:181
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:176
-; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:177
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:172
-; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:173
-; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:192
-; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:196
-; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:200
-; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:176
+; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:177
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:172
+; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:173
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:172
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:204
-; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:200
-; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:196
-; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:160
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:168
-; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:169
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:168
+; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:169
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:164
-; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:165
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v82 offset:164
+; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:165
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:160
-; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:161
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v82 offset:156
-; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:157
-; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:144
-; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:148
-; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:152
-; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 offset:160
+; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:161
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v80 offset:156
+; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:157
+; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:124
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:156
-; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:152
-; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:148
-; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:144
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v102
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v102
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v103
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v56
-; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:140
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v56
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:138
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v57
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v57
-; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:136
+; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:140
+; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:138
+; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:136
; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:134
; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:132
-; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:130
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:128
+; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:130
+; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:128
; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:126
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:152
-; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:153
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:152
+; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:153
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:148
-; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:149
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:148
+; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:149
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:144
-; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:145
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:140
-; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:141
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:144
+; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:145
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:140
+; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:141
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252
-; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248
-; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244
-; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99
+; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v99
; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v98
; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v34
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v34
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v97
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v96
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v32
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v35
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v34
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v33
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v19 offset:120
; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:121
@@ -5933,15 +5929,15 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v16 offset:108
; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:109
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256
-; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260
-; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264
-; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268
-; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264
-; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260
-; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:224
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:104
; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:105
@@ -5954,23 +5950,23 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:92
; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:93
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:188
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220
-; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216
-; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212
-; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:76
-; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:74
-; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:72
-; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:70
-; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:68
-; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:66
-; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:64
-; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:62
+; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:76
+; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:74
+; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:72
+; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:70
+; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:68
+; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:66
+; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:64
+; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:62
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:88
; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:89
@@ -5983,218 +5979,216 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:76
; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:77
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316
-; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312
-; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308
-; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v28
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27
+; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v24
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v27
; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v26
; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v33
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33
-; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:36
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v50
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v32
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v31
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v31
+; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:36
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v49
; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v30
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30
-; ALIGNED-NEXT: flat_store_byte v[24:25], v114 offset:1
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v36
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v35
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v49
; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49
-; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:40
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v30
+; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:34
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v48
; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48
-; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:38
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v39
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v29
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v28
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28
+; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:40
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v39
; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39
-; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:44
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v38
+; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:38
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v38
; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38
-; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:42
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v53
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53
-; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:32
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v52
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52
-; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:30
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v51
+; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:44
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v37
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37
+; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:42
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v36
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36
+; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:32
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v51
; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51
-; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:34
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v65
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:28
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v64
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64
-; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:26
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v55
-; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:222
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v50
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50
+; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:28
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v55
+; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:228
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v11
; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55
-; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:24
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v54
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11
+; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:26
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v54
+; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:226
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v10
; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54
-; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:22
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v69
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69
-; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:20
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v68
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68
-; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v67
+; ALIGNED-NEXT: flat_store_byte v[0:1], v32 offset:1
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10
+; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0
+; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:24
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v53
+; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:224
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v9
+; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:222
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v8
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53
+; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:22
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v52
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52
+; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:20
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67
; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67
-; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:16
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v66
+; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:18
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v66
; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66
-; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:14
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v81
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81
-; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:12
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v80
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80
-; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:10
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v71
+; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:16
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v65
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65
+; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:14
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v64
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64
+; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:12
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v71
; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71
-; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:8
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v70
+; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:10
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v70
; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70
-; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:6
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v85
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85
-; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:4
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84
-; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:2
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v83
+; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:8
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v69
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69
+; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:6
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v68
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68
+; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:4
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v83
; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83
-; ALIGNED-NEXT: flat_store_byte v[20:21], v34
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v82
+; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:2
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v82
; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82
-; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:252
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97
-; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:250
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v96
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96
-; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:248
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87
+; ALIGNED-NEXT: flat_store_byte v[20:21], v115
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v81
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v80
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80
+; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:252
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v87
; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87
-; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:246
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v86
+; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:250
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v86
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86
-; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:244
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v19
+; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:248
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v85
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85
+; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:246
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v84
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84
+; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:244
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v19
; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19
-; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:242
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v18
+; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:242
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v18
; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18
-; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:240
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v17
+; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:240
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v17
; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17
-; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:238
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v16
+; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:238
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v16
; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16
-; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:236
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v15
+; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:236
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v15
; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15
-; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:234
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v14
+; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:234
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v14
; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14
-; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:232
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v13
+; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:232
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v13
; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13
-; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:230
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v12
+; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:230
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v12
; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12
-; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:228
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v11
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11
-; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:226
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v10
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10
-; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:224
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v9
; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9
-; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:220
-; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:218
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:216
-; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:214
-; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:212
-; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:210
-; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:208
-; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:206
-; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:204
-; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:202
-; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:200
-; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:198
-; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:196
-; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:194
-; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:192
-; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:190
-; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:188
-; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:186
-; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:184
-; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:182
-; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:180
-; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:178
-; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:176
-; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:174
-; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:172
-; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:170
-; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:168
-; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:166
-; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:164
-; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:162
-; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:160
-; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:158
-; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:156
-; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:154
-; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:152
-; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:150
-; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:148
-; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:146
-; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:144
-; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:142
-; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:124
+; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:220
+; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:218
+; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:216
+; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:214
+; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:212
+; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:210
+; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:208
+; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:206
+; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:204
+; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:202
+; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:200
+; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:198
+; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:196
+; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:194
+; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:192
+; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:190
+; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:188
+; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:186
+; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:184
+; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:182
+; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:180
+; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:178
+; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:176
+; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:174
+; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:172
+; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:170
+; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:168
+; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:166
+; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:164
+; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:162
+; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:160
+; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:158
+; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:156
+; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:154
+; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:152
+; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:150
+; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:148
+; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:146
+; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:144
+; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:142
+; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:124
; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:122
-; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:120
+; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:120
; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:118
-; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:116
+; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:116
; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:114
-; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:112
+; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:112
; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:110
-; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:108
+; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:108
; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:106
-; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:104
+; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:104
; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:102
-; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:100
+; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:100
; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:98
-; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:96
+; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:96
; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:94
-; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:92
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:92
; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:90
-; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:88
+; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:88
; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:86
-; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:84
+; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:84
; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:82
-; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:80
+; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:80
; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:78
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:56
@@ -6225,681 +6219,679 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48
; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46
; ALIGNED-NEXT: s_cbranch_scc1 .LBB5_2
-; ALIGNED-NEXT: .LBB5_3: ; %Flow5
-; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6
+; ALIGNED-NEXT: .LBB5_3: ; %Flow15
+; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
; ALIGNED-NEXT: s_cbranch_execz .LBB5_6
; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; ALIGNED-NEXT: s_movk_i32 s6, 0xff00
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700
-; ALIGNED-NEXT: s_mov_b32 s7, -1
+; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x700, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v18, vcc_lo, 0x700, v0
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v1, vcc_lo
+; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
+; ALIGNED-NEXT: s_mov_b32 s5, -1
; ALIGNED-NEXT: .LBB5_5: ; %memmove_bwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v6, vcc_lo, v4, 48
-; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[4:5] offset:128
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 0x90, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xa0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xb0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, 0xc0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v66, vcc_lo, 0xd0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v67, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v68, vcc_lo, 0xe0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v69, null, 0, v5, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v70, vcc_lo, 0xf0, v4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v71, null, 0, v5, vcc_lo
+; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[16:17] offset:128
+; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, v16, 48
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v17, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, 0x50, v16
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v17, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x60, v16
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v17, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x70, v16
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v17, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v37, vcc_lo, 0x90, v16
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v38, null, 0, v17, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v48, vcc_lo, 0xa0, v16
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v17, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v50, vcc_lo, 0xb0, v16
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v51, null, 0, v17, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xc0, v16
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v17, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xd0, v16
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v17, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, 0xe0, v16
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, v17, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v66, vcc_lo, 0xf0, v16
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v67, null, 0, v17, vcc_lo
; ALIGNED-NEXT: s_clause 0xe
-; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[4:5]
-; ALIGNED-NEXT: flat_load_dwordx4 v[35:38], v[4:5] offset:16
-; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[4:5] offset:32
-; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[4:5] offset:64
-; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[6:7]
+; ALIGNED-NEXT: flat_load_dwordx4 v[24:27], v[16:17] offset:64
+; ALIGNED-NEXT: flat_load_dwordx4 v[33:36], v[16:17] offset:16
+; ALIGNED-NEXT: flat_load_dwordx4 v[28:31], v[16:17] offset:32
+; ALIGNED-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
-; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
-; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25]
-; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[52:53]
-; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[54:55]
-; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[64:65]
-; ALIGNED-NEXT: flat_load_dwordx4 v[55:58], v[66:67]
-; ALIGNED-NEXT: flat_load_dwordx4 v[59:62], v[68:69]
-; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[70:71]
-; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v1, vcc_lo
-; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
-; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
+; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[37:38]
+; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[48:49]
+; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[50:51]
+; ALIGNED-NEXT: flat_load_dwordx4 v[39:42], v[52:53]
+; ALIGNED-NEXT: flat_load_dwordx4 v[43:46], v[54:55]
+; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[64:65]
+; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[66:67]
+; ALIGNED-NEXT: flat_load_dwordx4 v[80:83], v[16:17]
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:416
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:420
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:424
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:396
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v102, off, s[0:3], s32 offset:428
-; ALIGNED-NEXT: buffer_load_dword v103, off, s[0:3], s32 offset:424
-; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:420
-; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 offset:416
-; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v24, 6
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v25, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v24, 3
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v25, vcc_lo
-; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
+; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v18, 6
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v19, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v18, 3
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v19, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0xffffff00, v16
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, -1, v17, vcc_lo
+; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v102 offset:136
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:137
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v47 offset:136
+; ALIGNED-NEXT: flat_store_byte v[20:21], v47 offset:137
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v103 offset:132
-; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:133
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v56 offset:132
+; ALIGNED-NEXT: flat_store_byte v[20:21], v56 offset:133
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v63 offset:128
-; ALIGNED-NEXT: flat_store_byte v[20:21], v63 offset:129
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v72 offset:124
-; ALIGNED-NEXT: flat_store_byte v[24:25], v72 offset:128
-; ALIGNED-NEXT: s_waitcnt lgkmcnt(19)
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:480
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:484
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:488
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v57 offset:128
+; ALIGNED-NEXT: flat_store_byte v[20:21], v57 offset:129
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v58 offset:124
+; ALIGNED-NEXT: flat_store_byte v[18:19], v58 offset:128
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(22)
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:460
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:492
-; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:488
-; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:484
-; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:448
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v101 offset:72
-; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:73
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:72
+; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:73
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v100 offset:68
-; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:69
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:68
+; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:69
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:64
-; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:65
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:60
-; ALIGNED-NEXT: flat_store_byte v[24:25], v98 offset:64
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:576
-; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:584
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:588
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:64
+; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:65
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:60
+; ALIGNED-NEXT: flat_store_byte v[18:19], v96 offset:64
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(28)
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:544
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:548
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:552
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:556
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584
-; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:588
-; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576
-; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:580
+; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:552
+; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:556
+; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:544
+; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:548
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:36
-; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:37
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:36
+; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:37
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v34 offset:40
-; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:41
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:40
+; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:41
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:28
-; ALIGNED-NEXT: flat_store_byte v[24:25], v33 offset:32
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:32
-; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:33
-; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:528
-; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:532
-; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:536
-; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:540
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:28
+; ALIGNED-NEXT: flat_store_byte v[18:19], v31 offset:32
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:32
+; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:33
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:496
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:508
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:540
-; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:536
-; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532
-; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:528
+; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:496
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:24
-; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:25
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:24
+; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:25
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:20
-; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:21
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:20
+; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:21
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:16
-; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:17
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:12
-; ALIGNED-NEXT: flat_store_byte v[24:25], v52 offset:16
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:544
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:548
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:552
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:556
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:16
+; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:17
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:12
+; ALIGNED-NEXT: flat_store_byte v[18:19], v50 offset:16
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(32)
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:512
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:516
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:520
+; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:524
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:556
-; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:552
-; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:548
-; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:544
+; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:524
+; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:520
+; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:516
+; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:512
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:8
-; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:9
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:8
+; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:9
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v82 offset:4
-; ALIGNED-NEXT: flat_store_byte v[24:25], v82 offset:8
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:4
+; ALIGNED-NEXT: flat_store_byte v[18:19], v83 offset:8
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81
-; ALIGNED-NEXT: flat_store_byte v[24:25], v81 offset:4
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[24:25], v80 offset:2
-; ALIGNED-NEXT: flat_store_byte v[24:25], v80
-; ALIGNED-NEXT: s_waitcnt lgkmcnt(40)
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:368
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:372
-; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:376
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:380
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v82
+; ALIGNED-NEXT: flat_store_byte v[18:19], v82 offset:4
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[18:19], v70 offset:2
+; ALIGNED-NEXT: flat_store_byte v[18:19], v70
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:348
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:380
-; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:376
-; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:372
-; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:336
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:248
-; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:249
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:248
+; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:249
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:244
-; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:245
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:244
+; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:245
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:240
-; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:241
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:240
+; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:241
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:236
-; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:237
-; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:384
-; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:388
-; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:392
-; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:236
+; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:237
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:396
-; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:392
-; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:388
-; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:232
-; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:233
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:232
+; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:233
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:228
-; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:229
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v80 offset:228
+; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:229
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:224
-; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:225
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:220
-; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:221
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336
-; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:340
-; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:344
-; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:224
+; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:225
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:220
+; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:221
+; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:316
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:348
-; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:344
-; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:340
-; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:216
-; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:217
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 offset:216
+; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:217
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:212
-; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:213
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:212
+; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:213
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:208
-; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:209
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:204
-; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:205
-; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352
-; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:356
-; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:360
-; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:208
+; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:209
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:204
+; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:205
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:320
+; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:332
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:364
-; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:360
-; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:356
-; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:352
+; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:320
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:200
-; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:201
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:200
+; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:201
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:196
-; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:197
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:196
+; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:197
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:192
-; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:193
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35 offset:188
-; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:189
-; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:432
-; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:436
-; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:440
-; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:192
+; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:193
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:188
+; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:189
+; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:412
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:444
-; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:440
-; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:436
-; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432
+; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:412
+; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:400
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:184
-; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:185
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:184
+; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:185
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:180
-; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:181
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:180
+; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:181
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:176
-; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:177
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:172
-; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:173
-; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:448
-; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:452
-; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:456
-; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v34 offset:176
+; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:177
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:172
+; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:173
+; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:428
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:460
-; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:456
-; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:452
-; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v57
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v57
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v58
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v58
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:168
-; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:169
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:168
+; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:169
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:164
-; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:165
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35 offset:164
+; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:165
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:160
-; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:161
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:156
-; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:157
-; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:400
-; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:404
-; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:408
-; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:412
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:160
+; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:161
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v25 offset:156
+; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:157
+; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:372
+; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:376
+; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:380
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:412
-; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:408
-; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:404
-; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:400
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v102
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v102
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v103
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v63
-; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:140
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v63
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:138
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v72
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v72
-; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:136
+; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:380
+; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:376
+; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:372
+; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v47
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v47
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v56
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v56
+; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:140
+; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:138
+; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:136
; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:134
; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:132
-; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:130
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:128
+; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:130
+; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:128
; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:126
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:152
-; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:153
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:152
+; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:153
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:148
-; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:149
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:148
+; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:149
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:144
-; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:145
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:140
-; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:141
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:496
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:500
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:504
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:144
+; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:145
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v24 offset:140
+; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:141
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:476
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:508
-; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:504
-; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:500
-; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99
+; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v99
; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v98
; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v80
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v86
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v80
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v97
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v96
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v70
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v84
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v83
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v19 offset:120
-; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:121
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:120
+; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:121
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v18 offset:116
-; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:117
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v14 offset:116
+; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:117
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v17 offset:112
-; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:113
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v13 offset:112
+; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:113
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v16 offset:108
-; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:109
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:512
-; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516
-; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:520
-; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:524
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:108
+; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:109
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:524
-; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:520
-; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:516
-; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512
+; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:104
-; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:105
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:104
+; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:105
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v14 offset:100
-; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:101
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v10 offset:100
+; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:101
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v13 offset:96
-; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:97
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:92
-; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:93
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:468
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:472
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v9 offset:96
+; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:97
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:92
+; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:93
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:432
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:444
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:476
-; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:472
-; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:468
-; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:464
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:76
-; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:74
-; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:72
-; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:70
-; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:68
-; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:66
-; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:64
-; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:62
+; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:432
+; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:76
+; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:74
+; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:72
+; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:70
+; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:68
+; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:66
+; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:64
+; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:62
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:88
-; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:89
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:88
+; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:89
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v10 offset:84
-; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:85
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v6 offset:84
+; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:85
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v9 offset:80
-; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:81
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v5 offset:80
+; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:81
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:76
-; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:77
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:564
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:572
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v4 offset:76
+; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:77
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:572
-; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:568
-; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:564
-; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:560
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v39
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33
+; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536
+; ALIGNED-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v37
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v32
; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v31
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v65
+; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:36
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65
; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67
-; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:36
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v54
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v66
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v64
+; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:34
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v52
; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52
-; ALIGNED-NEXT: flat_store_byte v[24:25], v114 offset:1
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v82
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v82
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v81
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97
-; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:40
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v51
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50
+; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:40
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v87
; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87
-; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:38
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v83
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83
-; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:44
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v68
+; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:38
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v85
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85
+; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:44
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v71
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71
+; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:42
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v66
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66
+; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:32
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v86
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86
+; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v80
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80
+; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:28
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81
+; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:228
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v7
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7
+; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:26
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v68
+; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:226
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v6
; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68
-; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:42
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96
-; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:32
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v84
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84
-; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:30
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v69
+; ALIGNED-NEXT: flat_store_byte v[18:19], v70 offset:1
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6
+; ALIGNED-NEXT: v_add_co_u32 v18, vcc_lo, 0xffffff00, v18
+; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:24
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v53
+; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:224
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v5
+; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:222
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v4
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53
+; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:22
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v38
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38
+; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:20
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v69
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69
-; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:34
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v85
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:28
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v70
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70
-; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:26
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v55
-; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:222
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:18
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v54
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54
+; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:16
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39
+; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:14
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v33
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33
+; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:12
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v55
; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55
-; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:24
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v48
+; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:10
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v48
; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48
-; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:22
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v71
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71
-; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:20
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v64
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64
-; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v49
+; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:8
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v34
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34
+; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:6
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v27
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27
+; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:4
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v49
; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49
-; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:16
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v35
+; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:2
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v35
; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35
-; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:14
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v65
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65
-; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:12
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50
-; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:10
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v36
+; ALIGNED-NEXT: flat_store_byte v[20:21], v115
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v28
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v25
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25
+; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:252
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v36
; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36
-; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:8
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v29
+; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:250
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v29
; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29
-; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:6
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v51
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51
-; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:4
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v37
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37
-; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:2
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v30
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30
-; ALIGNED-NEXT: flat_store_byte v[20:21], v80
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v27
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27
-; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:252
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38
-; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:250
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v31
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31
-; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:248
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v28
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28
-; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:246
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v26
+; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:248
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v26
; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26
-; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:244
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v19
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19
-; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:242
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18
-; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:240
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v17
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17
-; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:238
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v16
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16
-; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:236
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15
+; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:246
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v24
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:244
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v15
; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15
-; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:234
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v14
+; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:242
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v14
; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14
-; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:232
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v13
+; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:240
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v13
; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13
-; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:230
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v12
+; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:238
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v12
; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12
-; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:228
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v11
+; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:236
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v11
; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11
-; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:226
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10
+; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:234
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v10
; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10
-; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:224
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v9
+; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:232
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v9
; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9
-; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:220
-; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:218
-; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:216
-; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:214
-; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:212
-; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:210
-; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:208
-; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:206
-; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:204
-; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:202
-; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:200
-; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:198
-; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:196
-; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:194
-; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:192
-; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:190
-; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:188
-; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:186
-; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:184
-; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:182
-; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:180
-; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:178
-; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:176
-; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:174
-; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:172
-; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:170
+; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:230
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v8
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5
+; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:220
+; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:218
+; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:216
+; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:214
+; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:212
+; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:210
+; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:208
+; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:206
+; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:204
+; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:202
+; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:200
+; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:198
+; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:196
+; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:194
+; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:192
+; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:190
+; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:188
+; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:186
+; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:184
+; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:182
+; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:180
+; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:178
+; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:176
+; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:174
+; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:172
+; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:170
; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:168
-; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:166
-; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:164
-; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:162
-; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:160
-; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:158
-; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:156
-; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:154
-; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:152
-; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:150
-; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:148
-; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:146
-; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:144
-; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:142
-; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:124
-; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:122
-; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:120
-; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:118
-; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:116
-; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:114
-; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:112
-; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:110
-; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:108
-; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:106
-; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:104
-; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:102
-; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:100
-; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:98
-; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:96
-; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:94
-; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:92
-; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:90
-; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:88
-; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:86
-; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:84
-; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:82
-; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:80
-; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:78
+; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:166
+; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:164
+; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:162
+; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:160
+; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:158
+; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:156
+; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:154
+; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:152
+; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:150
+; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:148
+; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:146
+; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:144
+; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:142
+; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:124
+; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:122
+; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:120
+; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:118
+; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:116
+; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:114
+; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:112
+; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:110
+; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:108
+; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:106
+; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:104
+; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:102
+; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:100
+; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:98
+; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:96
+; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:94
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v19, null, -1, v19, vcc_lo
+; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:92
+; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:90
+; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:88
+; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:86
+; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:84
+; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:82
+; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:80
+; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:78
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:56
-; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:57
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v3 offset:56
+; ALIGNED-NEXT: flat_store_byte v[20:21], v3 offset:57
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v6 offset:52
-; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:53
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v2 offset:52
+; ALIGNED-NEXT: flat_store_byte v[20:21], v2 offset:53
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v5 offset:48
-; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:49
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v4 offset:44
-; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:45
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4
-; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:60
-; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:58
-; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:56
-; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:54
-; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:52
-; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:50
-; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48
-; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v1 offset:48
+; ALIGNED-NEXT: flat_store_byte v[20:21], v1 offset:49
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v0 offset:44
+; ALIGNED-NEXT: flat_store_byte v[20:21], v0 offset:45
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 24, v3
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 8, v3
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 24, v2
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 24, v1
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 8, v0
+; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:60
+; ALIGNED-NEXT: flat_store_byte v[20:21], v3 offset:58
+; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:56
+; ALIGNED-NEXT: flat_store_byte v[20:21], v2 offset:54
+; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:52
+; ALIGNED-NEXT: flat_store_byte v[20:21], v1 offset:50
+; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:48
+; ALIGNED-NEXT: flat_store_byte v[20:21], v0 offset:46
; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5
-; ALIGNED-NEXT: .LBB5_6: ; %Flow6
-; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x10 ; 68-byte Folded Reload
-; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32
-; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4
-; ALIGNED-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8
-; ALIGNED-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12
-; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16
-; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20
-; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24
-; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28
-; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32
-; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36
-; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40
-; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44
-; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48
-; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52
-; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56
-; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60
-; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT: .LBB5_6: ; %Flow16
+; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; ALIGNED-NEXT: s_clause 0xa ; 44-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32
+; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40
; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
@@ -6911,27 +6903,31 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4
; UNROLL3-NEXT: s_cbranch_execz .LBB5_4
; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
-; UNROLL3-NEXT: s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT: v_mov_b32_e32 v5, v3
+; UNROLL3-NEXT: v_mov_b32_e32 v7, v1
+; UNROLL3-NEXT: v_mov_b32_e32 v4, v2
+; UNROLL3-NEXT: v_mov_b32_e32 v6, v0
+; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0
; UNROLL3-NEXT: .p2align 6
; UNROLL3-NEXT: .LBB5_2: ; %memmove_fwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
-; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
-; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_clause 0x2
-; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13]
-; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16
-; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[12:13] offset:32
-; UNROLL3-NEXT: s_add_u32 s4, s4, 48
-; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[4:5]
+; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[4:5] offset:16
+; UNROLL3-NEXT: flat_load_dwordx4 v[16:19], v[4:5] offset:32
+; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, v4, 48
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0
+; UNROLL3-NEXT: s_addc_u32 s5, s5, -1
; UNROLL3-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
+; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[8:11]
; UNROLL3-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16
+; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[12:15] offset:16
; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32
-; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0
+; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[16:19] offset:32
+; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v6, 48
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0
; UNROLL3-NEXT: s_cbranch_scc1 .LBB5_2
; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual
; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016
@@ -6940,44 +6936,47 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0
; UNROLL3-NEXT: flat_load_dwordx4 v[2:5], v[2:3] offset:2032
; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:2032
-; UNROLL3-NEXT: ; implicit-def: $vgpr2_vgpr3
-; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; UNROLL3-NEXT: .LBB5_4: ; %Flow3
-; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6
+; UNROLL3-NEXT: ; implicit-def: $vgpr2
+; UNROLL3-NEXT: ; implicit-def: $vgpr0
+; UNROLL3-NEXT: .LBB5_4: ; %Flow13
+; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6
; UNROLL3-NEXT: s_cbranch_execz .LBB5_7
; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual
; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2032
-; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0
-; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0
-; UNROLL3-NEXT: s_mov_b32 s7, -1
+; UNROLL3-NEXT: s_movk_i32 s4, 0xf820
+; UNROLL3-NEXT: s_mov_b32 s5, -1
; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2032
-; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:2016
+; UNROLL3-NEXT: flat_load_dwordx4 v[6:9], v[2:3] offset:2016
+; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0x7b0, v2
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0x7b0, v0
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016
+; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:2016
; UNROLL3-NEXT: .p2align 6
; UNROLL3-NEXT: .LBB5_6: ; %memmove_bwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
-; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
-; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_clause 0x2
-; UNROLL3-NEXT: flat_load_dwordx4 v[4:7], v[12:13]
-; UNROLL3-NEXT: flat_load_dwordx4 v[8:11], v[12:13] offset:16
-; UNROLL3-NEXT: flat_load_dwordx4 v[12:15], v[12:13] offset:32
-; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0
-; UNROLL3-NEXT: s_addc_u32 s5, s5, -1
+; UNROLL3-NEXT: flat_load_dwordx4 v[6:9], v[2:3]
+; UNROLL3-NEXT: flat_load_dwordx4 v[10:13], v[2:3] offset:16
+; UNROLL3-NEXT: flat_load_dwordx4 v[14:17], v[2:3] offset:32
+; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffffd0, v2
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
+; UNROLL3-NEXT: s_add_u32 s4, s4, 48
+; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
; UNROLL3-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
+; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[6:9]
; UNROLL3-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16
+; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[10:13] offset:16
; UNROLL3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32
-; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
+; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[14:17] offset:32
+; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffffd0, v4
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, vcc_lo
+; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0
; UNROLL3-NEXT: s_cbranch_scc0 .LBB5_6
-; UNROLL3-NEXT: .LBB5_7: ; %Flow4
-; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; UNROLL3-NEXT: .LBB5_7: ; %Flow14
+; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6
; UNROLL3-NEXT: s_waitcnt lgkmcnt(0)
; UNROLL3-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -6994,756 +6993,759 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4
; CHECK-NEXT: s_cbranch_execz .LBB6_3
; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0x800
; CHECK-NEXT: .LBB6_2: ; %memmove_fwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_clause 0xf
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:224
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:240
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:192
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:208
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:160
-; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:176
-; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:128
-; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:144
-; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:96
-; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:112
-; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:64
-; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:80
-; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:32
-; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:48
-; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off
-; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off offset:16
-; CHECK-NEXT: s_add_u32 s4, s4, 0x100
-; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:224
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:240
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:192
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:208
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:160
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:176
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:128
+; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:144
+; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:96
+; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:112
+; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:64
+; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:32
+; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:16
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00
+; CHECK-NEXT: s_addc_u32 s5, s5, -1
; CHECK-NEXT: s_waitcnt vmcnt(15)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[4:7], off offset:224
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:224
; CHECK-NEXT: s_waitcnt vmcnt(14)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[8:11], off offset:240
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off offset:240
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[12:15], off offset:192
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[12:15], off offset:192
; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[16:19], off offset:208
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[16:19], off offset:208
; CHECK-NEXT: s_waitcnt vmcnt(11)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[20:23], off offset:160
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[20:23], off offset:160
; CHECK-NEXT: s_waitcnt vmcnt(10)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[24:27], off offset:176
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[24:27], off offset:176
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[28:31], off offset:128
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[28:31], off offset:128
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[32:35], off offset:144
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[32:35], off offset:144
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[36:39], off offset:96
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[36:39], off offset:96
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[48:51], off offset:112
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[48:51], off offset:112
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[52:55], off offset:64
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[52:55], off offset:64
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[64:67], off offset:80
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[64:67], off offset:80
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[68:71], off offset:32
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[68:71], off offset:32
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[80:83], off offset:48
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[80:83], off offset:48
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[84:87], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[84:87], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[96:99], off offset:16
-; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[96:99], off offset:16
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB6_2
-; CHECK-NEXT: .LBB6_3: ; %Flow7
-; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6
+; CHECK-NEXT: .LBB6_3: ; %Flow17
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
; CHECK-NEXT: s_cbranch_execz .LBB6_6
; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; CHECK-NEXT: s_movk_i32 s6, 0xff00
-; CHECK-NEXT: s_mov_b64 s[4:5], 0x700
-; CHECK-NEXT: s_mov_b32 s7, -1
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x700, v2
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; CHECK-NEXT: s_movk_i32 s4, 0xf800
+; CHECK-NEXT: s_mov_b32 s5, -1
; CHECK-NEXT: .LBB6_5: ; %memmove_bwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_clause 0xf
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:224
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:240
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:192
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:208
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:160
-; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:176
-; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:128
-; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:144
-; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:96
-; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:112
-; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:64
-; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:80
-; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:32
-; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:48
-; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off
-; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off offset:16
-; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00
-; CHECK-NEXT: s_addc_u32 s5, s5, -1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:224
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:240
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:192
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:208
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:160
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:176
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:128
+; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:144
+; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:96
+; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:112
+; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:64
+; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:32
+; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:16
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
+; CHECK-NEXT: s_add_u32 s4, s4, 0x100
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: s_waitcnt vmcnt(15)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[4:7], off offset:224
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:224
; CHECK-NEXT: s_waitcnt vmcnt(14)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[8:11], off offset:240
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off offset:240
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[12:15], off offset:192
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[12:15], off offset:192
; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[16:19], off offset:208
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[16:19], off offset:208
; CHECK-NEXT: s_waitcnt vmcnt(11)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[20:23], off offset:160
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[20:23], off offset:160
; CHECK-NEXT: s_waitcnt vmcnt(10)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[24:27], off offset:176
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[24:27], off offset:176
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[28:31], off offset:128
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[28:31], off offset:128
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[32:35], off offset:144
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[32:35], off offset:144
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[36:39], off offset:96
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[36:39], off offset:96
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[48:51], off offset:112
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[48:51], off offset:112
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[52:55], off offset:64
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[52:55], off offset:64
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[64:67], off offset:80
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[64:67], off offset:80
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[68:71], off offset:32
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[68:71], off offset:32
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[80:83], off offset:48
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[80:83], off offset:48
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[84:87], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[84:87], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[100:101], v[96:99], off offset:16
-; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[96:99], off offset:16
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB6_5
-; CHECK-NEXT: .LBB6_6: ; %Flow8
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: .LBB6_6: ; %Flow18
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_setpc_b64 s[30:31]
;
; ALIGNED-LABEL: memmove_p1_p1_sz2048:
; ALIGNED: ; %bb.0: ; %entry
; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_mov_b32 s4, exec_lo
; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4
; ALIGNED-NEXT: s_cbranch_execz .LBB6_3
; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800
; ALIGNED-NEXT: .LBB6_2: ; %memmove_fwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
-; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[20:21], off offset:240
-; ALIGNED-NEXT: global_load_dwordx4 v[22:25], v[20:21], off offset:224
-; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[20:21], off
-; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[20:21], off offset:16
-; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[20:21], off offset:32
-; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[20:21], off offset:48
-; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[20:21], off offset:64
-; ALIGNED-NEXT: global_load_dwordx4 v[82:85], v[20:21], off offset:80
-; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[20:21], off offset:96
-; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[20:21], off offset:112
-; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[20:21], off offset:128
-; ALIGNED-NEXT: global_load_dwordx4 v[50:53], v[20:21], off offset:144
-; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[20:21], off offset:160
-; ALIGNED-NEXT: global_load_dwordx4 v[34:37], v[20:21], off offset:176
-; ALIGNED-NEXT: global_load_dwordx4 v[30:33], v[20:21], off offset:192
-; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[20:21], off offset:208
+; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:240
+; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:224
+; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:32
+; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:48
+; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:64
+; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:80
+; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:96
+; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:112
+; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:128
+; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:144
+; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:160
+; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:176
+; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:192
+; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:208
+; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
+; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
; ALIGNED-NEXT: s_waitcnt vmcnt(15)
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76
-; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72
-; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68
-; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64
-; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
-; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
-; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:254
-; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:252
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v19, off offset:254
+; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:252
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:250
-; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:248
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v18, off offset:250
+; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:248
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:246
-; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:244
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:242
-; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:240
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v17, off offset:246
+; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:244
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v16, off offset:242
+; ALIGNED-NEXT: global_store_byte v[0:1], v16, off offset:240
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92
-; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88
-; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84
-; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20
-; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800
+; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v18
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v17
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v16
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:238
-; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:236
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v23, off offset:238
+; ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:236
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:234
-; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:232
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v22, off offset:234
+; ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:232
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:230
-; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:228
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:226
-; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:224
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v21, off offset:230
+; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:228
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v20, off offset:226
+; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:224
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:60
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44
-; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40
-; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
-; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v23
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v22
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v21
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v20
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:222
-; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:220
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v27, off offset:222
+; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:220
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:218
-; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:216
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v26, off offset:218
+; ALIGNED-NEXT: global_store_byte v[0:1], v26, off offset:216
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:214
-; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:212
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:210
-; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:208
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48
-; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v25, off offset:214
+; ALIGNED-NEXT: global_store_byte v[0:1], v25, off offset:212
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v24, off offset:210
+; ALIGNED-NEXT: global_store_byte v[0:1], v24, off offset:208
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
-; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
-; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52
-; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:206
-; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:204
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v31, off offset:206
+; ALIGNED-NEXT: global_store_byte v[0:1], v31, off offset:204
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:202
-; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:200
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v30, off offset:202
+; ALIGNED-NEXT: global_store_byte v[0:1], v30, off offset:200
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:198
-; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:196
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:194
-; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:192
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128
-; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132
-; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136
-; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v29, off offset:198
+; ALIGNED-NEXT: global_store_byte v[0:1], v29, off offset:196
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v28, off offset:194
+; ALIGNED-NEXT: global_store_byte v[0:1], v28, off offset:192
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140
-; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136
-; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132
-; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:190
-; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:188
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v35, off offset:190
+; ALIGNED-NEXT: global_store_byte v[0:1], v35, off offset:188
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:186
-; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:184
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v34, off offset:186
+; ALIGNED-NEXT: global_store_byte v[0:1], v34, off offset:184
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:182
-; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:180
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:178
-; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:176
-; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144
-; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:148
-; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152
-; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v33, off offset:182
+; ALIGNED-NEXT: global_store_byte v[0:1], v33, off offset:180
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v32, off offset:178
+; ALIGNED-NEXT: global_store_byte v[0:1], v32, off offset:176
+; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156
-; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152
-; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148
-; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:174
-; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:172
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v39, off offset:174
+; ALIGNED-NEXT: global_store_byte v[0:1], v39, off offset:172
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:170
-; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:168
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v38, off offset:170
+; ALIGNED-NEXT: global_store_byte v[0:1], v38, off offset:168
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:166
-; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:164
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:162
-; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:160
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v37, off offset:166
+; ALIGNED-NEXT: global_store_byte v[0:1], v37, off offset:164
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v36, off offset:162
+; ALIGNED-NEXT: global_store_byte v[0:1], v36, off offset:160
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:124
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108
-; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
-; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100
-; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:112
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:158
-; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:156
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v51, off offset:158
+; ALIGNED-NEXT: global_store_byte v[0:1], v51, off offset:156
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:154
-; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:152
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v50, off offset:154
+; ALIGNED-NEXT: global_store_byte v[0:1], v50, off offset:152
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:150
-; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:148
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:146
-; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:144
-; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112
-; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116
-; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120
-; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124
-; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124
-; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120
-; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116
-; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:142
-; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:140
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v49, off offset:150
+; ALIGNED-NEXT: global_store_byte v[0:1], v49, off offset:148
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v48, off offset:146
+; ALIGNED-NEXT: global_store_byte v[0:1], v48, off offset:144
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v55, off offset:142
+; ALIGNED-NEXT: global_store_byte v[0:1], v55, off offset:140
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:138
-; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:136
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v54, off offset:138
+; ALIGNED-NEXT: global_store_byte v[0:1], v54, off offset:136
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:134
-; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:132
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:130
-; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:128
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:192
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v53, off offset:134
+; ALIGNED-NEXT: global_store_byte v[0:1], v53, off offset:132
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v52, off offset:130
+; ALIGNED-NEXT: global_store_byte v[0:1], v52, off offset:128
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:220
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204
-; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200
-; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196
-; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:208
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126
-; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v67, off offset:126
+; ALIGNED-NEXT: global_store_byte v[0:1], v67, off offset:124
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:122
-; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:120
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v66, off offset:122
+; ALIGNED-NEXT: global_store_byte v[0:1], v66, off offset:120
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:118
-; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:116
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:114
-; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:112
-; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:208
-; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212
-; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216
-; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v65, off offset:118
+; ALIGNED-NEXT: global_store_byte v[0:1], v65, off offset:116
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v64, off offset:114
+; ALIGNED-NEXT: global_store_byte v[0:1], v64, off offset:112
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:236
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:220
-; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216
-; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:212
-; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:224
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:110
-; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:108
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v71, off offset:110
+; ALIGNED-NEXT: global_store_byte v[0:1], v71, off offset:108
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:106
-; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:104
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v70, off offset:106
+; ALIGNED-NEXT: global_store_byte v[0:1], v70, off offset:104
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:102
-; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:100
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:98
-; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:96
-; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160
-; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164
-; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:168
-; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v69, off offset:102
+; ALIGNED-NEXT: global_store_byte v[0:1], v69, off offset:100
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v68, off offset:98
+; ALIGNED-NEXT: global_store_byte v[0:1], v68, off offset:96
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:188
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172
-; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:168
-; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164
-; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:176
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:94
-; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:92
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v83, off offset:94
+; ALIGNED-NEXT: global_store_byte v[0:1], v83, off offset:92
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:90
-; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:88
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v82, off offset:90
+; ALIGNED-NEXT: global_store_byte v[0:1], v82, off offset:88
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:86
-; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:84
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:82
-; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:80
-; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176
-; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180
-; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184
-; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v81, off offset:86
+; ALIGNED-NEXT: global_store_byte v[0:1], v81, off offset:84
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v80, off offset:82
+; ALIGNED-NEXT: global_store_byte v[0:1], v80, off offset:80
+; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:204
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188
-; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184
-; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:180
-; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:192
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:78
-; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:76
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v87, off offset:78
+; ALIGNED-NEXT: global_store_byte v[0:1], v87, off offset:76
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:74
-; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:72
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v86, off offset:74
+; ALIGNED-NEXT: global_store_byte v[0:1], v86, off offset:72
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:70
-; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:68
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:66
-; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:64
-; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256
-; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260
-; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:264
-; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v85, off offset:70
+; ALIGNED-NEXT: global_store_byte v[0:1], v85, off offset:68
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v84, off offset:66
+; ALIGNED-NEXT: global_store_byte v[0:1], v84, off offset:64
+; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:284
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:268
-; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264
-; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260
-; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:272
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:62
-; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:60
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v99, off offset:62
+; ALIGNED-NEXT: global_store_byte v[0:1], v99, off offset:60
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:58
-; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:56
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v98, off offset:58
+; ALIGNED-NEXT: global_store_byte v[0:1], v98, off offset:56
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:54
-; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:52
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:50
-; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:48
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272
-; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276
-; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v97, off offset:54
+; ALIGNED-NEXT: global_store_byte v[0:1], v97, off offset:52
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v96, off offset:50
+; ALIGNED-NEXT: global_store_byte v[0:1], v96, off offset:48
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284
-; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272
-; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42
-; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v15, off offset:42
+; ALIGNED-NEXT: global_store_byte v[0:1], v15, off offset:40
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46
-; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v14, off offset:46
+; ALIGNED-NEXT: global_store_byte v[0:1], v14, off offset:44
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34
-; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38
-; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v13, off offset:34
+; ALIGNED-NEXT: global_store_byte v[0:1], v13, off offset:32
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v12, off offset:38
+; ALIGNED-NEXT: global_store_byte v[0:1], v12, off offset:36
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236
-; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232
-; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228
-; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30
-; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v11, off offset:30
+; ALIGNED-NEXT: global_store_byte v[0:1], v11, off offset:28
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v10, off offset:26
-; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:24
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v10, off offset:26
+; ALIGNED-NEXT: global_store_byte v[0:1], v10, off offset:24
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v9, off offset:22
-; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:20
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v8, off offset:18
-; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v9, off offset:22
+; ALIGNED-NEXT: global_store_byte v[0:1], v9, off offset:20
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v8, off offset:18
+; ALIGNED-NEXT: global_store_byte v[0:1], v8, off offset:16
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252
-; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248
-; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244
-; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240
-; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27
-; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23
-; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:253
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22
-; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29
-; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:249
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v28
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28
+; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:255
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27
; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27
-; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:245
+; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:253
; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v26
; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26
-; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51
-; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243
+; ALIGNED-NEXT: global_store_byte v[0:1], v101, off offset:251
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25
+; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:221
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25
+; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:249
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v24
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT: global_store_byte v[0:1], v102, off offset:247
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31
+; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:245
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 24, v30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30
+; ALIGNED-NEXT: global_store_byte v[0:1], v103, off offset:243
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29
+; ALIGNED-NEXT: global_store_byte v[0:1], v16, off offset:241
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 24, v28
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28
+; ALIGNED-NEXT: global_store_byte v[0:1], v112, off offset:239
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v35
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35
+; ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:237
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34
+; ALIGNED-NEXT: global_store_byte v[0:1], v113, off offset:235
; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33
; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33
-; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:241
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32
+; ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:233
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v32
; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32
-; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31
-; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:237
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30
-; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235
+; ALIGNED-NEXT: global_store_byte v[0:1], v114, off offset:231
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v39
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39
+; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:229
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v38
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38
+; ALIGNED-NEXT: global_store_byte v[0:1], v115, off offset:227
; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37
; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37
-; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:233
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36
+; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:225
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v36
; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36
-; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35
-; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:229
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34
-; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:227
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49
+; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:223
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v51
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51
+; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:219
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v49
; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49
-; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:225
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48
+; ALIGNED-NEXT: global_store_byte v[0:1], v26, off offset:217
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v48
; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48
-; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39
-; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:221
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38
-; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:219
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53
-; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:217
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52
-; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:213
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50
-; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:211
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v65
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65
-; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:209
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64
-; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:149
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8
+; ALIGNED-NEXT: global_store_byte v[0:1], v101, off offset:215
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v55
+; ALIGNED-NEXT: global_store_byte v[0:1], v50, off offset:153
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8
; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64
-; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55
; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55
-; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:205
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54
+; ALIGNED-NEXT: global_store_byte v[0:1], v25, off offset:213
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v54
; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54
-; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:203
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69
-; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:201
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68
-; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67
+; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:211
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53
+; ALIGNED-NEXT: global_store_byte v[0:1], v24, off offset:209
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v52
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52
+; ALIGNED-NEXT: global_store_byte v[0:1], v102, off offset:207
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67
; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67
-; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:197
+; ALIGNED-NEXT: global_store_byte v[0:1], v31, off offset:205
; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66
; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66
-; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:195
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81
-; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:193
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80
-; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71
+; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:203
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 24, v65
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65
+; ALIGNED-NEXT: global_store_byte v[0:1], v30, off offset:201
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v64
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64
+; ALIGNED-NEXT: global_store_byte v[0:1], v103, off offset:199
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71
; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71
-; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:189
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v70
+; ALIGNED-NEXT: global_store_byte v[0:1], v29, off offset:197
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v70
; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70
-; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:187
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85
-; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:185
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84
-; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83
+; ALIGNED-NEXT: global_store_byte v[0:1], v16, off offset:195
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 24, v69
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69
+; ALIGNED-NEXT: global_store_byte v[0:1], v28, off offset:193
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v68
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68
+; ALIGNED-NEXT: global_store_byte v[0:1], v112, off offset:191
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v83
; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83
-; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:181
+; ALIGNED-NEXT: global_store_byte v[0:1], v35, off offset:189
; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v82
; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82
-; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:179
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97
-; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:177
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96
-; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:175
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87
+; ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:187
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v81
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81
+; ALIGNED-NEXT: global_store_byte v[0:1], v34, off offset:185
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v80
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80
+; ALIGNED-NEXT: global_store_byte v[0:1], v113, off offset:183
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87
; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87
-; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:173
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86
+; ALIGNED-NEXT: global_store_byte v[0:1], v33, off offset:181
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v86
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86
-; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:171
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101
-; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:169
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100
-; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v99
+; ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:179
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v85
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85
+; ALIGNED-NEXT: global_store_byte v[0:1], v32, off offset:177
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v84
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84
+; ALIGNED-NEXT: global_store_byte v[0:1], v114, off offset:175
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v99
; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99
-; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:165
+; ALIGNED-NEXT: global_store_byte v[0:1], v39, off offset:173
; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98
; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98
-; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:163
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15
+; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:171
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v97
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97
+; ALIGNED-NEXT: global_store_byte v[0:1], v38, off offset:169
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v96
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96
+; ALIGNED-NEXT: global_store_byte v[0:1], v115, off offset:167
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v15
; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15
-; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:161
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14
+; ALIGNED-NEXT: global_store_byte v[0:1], v37, off offset:165
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v14
; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14
-; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:159
+; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:163
; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13
; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13
-; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:157
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12
+; ALIGNED-NEXT: global_store_byte v[0:1], v36, off offset:161
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v12
; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12
-; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:155
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11
+; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:159
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v11
; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11
-; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:153
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10
+; ALIGNED-NEXT: global_store_byte v[0:1], v51, off offset:157
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v10
; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10
-; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9
+; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:155
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v9
; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9
-; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:147
-; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:145
-; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:143
-; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:141
-; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:139
-; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:137
-; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135
-; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:133
-; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:131
-; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:129
-; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:127
-; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:125
-; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:123
-; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:121
-; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119
-; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:117
-; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:115
-; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:113
-; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:111
-; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:109
-; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:107
-; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:105
-; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103
-; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:101
-; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:99
-; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:97
-; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:95
-; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:93
-; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:91
-; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:89
-; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:87
-; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:85
-; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:83
-; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:81
-; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:79
-; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:77
-; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:75
-; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:73
-; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:71
-; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:69
-; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:67
-; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:65
-; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:63
-; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:61
-; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:59
-; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:57
-; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55
-; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:53
-; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:51
-; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:49
-; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:43
-; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41
-; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:47
-; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45
-; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:35
-; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33
-; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:39
-; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37
-; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:31
-; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29
-; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:27
-; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25
-; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23
-; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21
-; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:19
-; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17
+; ALIGNED-NEXT: global_store_byte v[0:1], v19, off offset:151
+; ALIGNED-NEXT: global_store_byte v[0:1], v49, off offset:149
+; ALIGNED-NEXT: global_store_byte v[0:1], v26, off offset:147
+; ALIGNED-NEXT: global_store_byte v[0:1], v48, off offset:145
+; ALIGNED-NEXT: global_store_byte v[0:1], v101, off offset:143
+; ALIGNED-NEXT: global_store_byte v[0:1], v55, off offset:141
+; ALIGNED-NEXT: global_store_byte v[0:1], v25, off offset:139
+; ALIGNED-NEXT: global_store_byte v[0:1], v54, off offset:137
+; ALIGNED-NEXT: global_store_byte v[0:1], v18, off offset:135
+; ALIGNED-NEXT: global_store_byte v[0:1], v53, off offset:133
+; ALIGNED-NEXT: global_store_byte v[0:1], v24, off offset:131
+; ALIGNED-NEXT: global_store_byte v[0:1], v52, off offset:129
+; ALIGNED-NEXT: global_store_byte v[0:1], v102, off offset:127
+; ALIGNED-NEXT: global_store_byte v[0:1], v67, off offset:125
+; ALIGNED-NEXT: global_store_byte v[0:1], v31, off offset:123
+; ALIGNED-NEXT: global_store_byte v[0:1], v66, off offset:121
+; ALIGNED-NEXT: global_store_byte v[0:1], v17, off offset:119
+; ALIGNED-NEXT: global_store_byte v[0:1], v65, off offset:117
+; ALIGNED-NEXT: global_store_byte v[0:1], v30, off offset:115
+; ALIGNED-NEXT: global_store_byte v[0:1], v64, off offset:113
+; ALIGNED-NEXT: global_store_byte v[0:1], v103, off offset:111
+; ALIGNED-NEXT: global_store_byte v[0:1], v71, off offset:109
+; ALIGNED-NEXT: global_store_byte v[0:1], v29, off offset:107
+; ALIGNED-NEXT: global_store_byte v[0:1], v70, off offset:105
+; ALIGNED-NEXT: global_store_byte v[0:1], v16, off offset:103
+; ALIGNED-NEXT: global_store_byte v[0:1], v69, off offset:101
+; ALIGNED-NEXT: global_store_byte v[0:1], v28, off offset:99
+; ALIGNED-NEXT: global_store_byte v[0:1], v68, off offset:97
+; ALIGNED-NEXT: global_store_byte v[0:1], v112, off offset:95
+; ALIGNED-NEXT: global_store_byte v[0:1], v83, off offset:93
+; ALIGNED-NEXT: global_store_byte v[0:1], v35, off offset:91
+; ALIGNED-NEXT: global_store_byte v[0:1], v82, off offset:89
+; ALIGNED-NEXT: global_store_byte v[0:1], v23, off offset:87
+; ALIGNED-NEXT: global_store_byte v[0:1], v81, off offset:85
+; ALIGNED-NEXT: global_store_byte v[0:1], v34, off offset:83
+; ALIGNED-NEXT: global_store_byte v[0:1], v80, off offset:81
+; ALIGNED-NEXT: global_store_byte v[0:1], v113, off offset:79
+; ALIGNED-NEXT: global_store_byte v[0:1], v87, off offset:77
+; ALIGNED-NEXT: global_store_byte v[0:1], v33, off offset:75
+; ALIGNED-NEXT: global_store_byte v[0:1], v86, off offset:73
+; ALIGNED-NEXT: global_store_byte v[0:1], v22, off offset:71
+; ALIGNED-NEXT: global_store_byte v[0:1], v85, off offset:69
+; ALIGNED-NEXT: global_store_byte v[0:1], v32, off offset:67
+; ALIGNED-NEXT: global_store_byte v[0:1], v84, off offset:65
+; ALIGNED-NEXT: global_store_byte v[0:1], v114, off offset:63
+; ALIGNED-NEXT: global_store_byte v[0:1], v99, off offset:61
+; ALIGNED-NEXT: global_store_byte v[0:1], v39, off offset:59
+; ALIGNED-NEXT: global_store_byte v[0:1], v98, off offset:57
+; ALIGNED-NEXT: global_store_byte v[0:1], v21, off offset:55
+; ALIGNED-NEXT: global_store_byte v[0:1], v97, off offset:53
+; ALIGNED-NEXT: global_store_byte v[0:1], v38, off offset:51
+; ALIGNED-NEXT: global_store_byte v[0:1], v96, off offset:49
+; ALIGNED-NEXT: global_store_byte v[0:1], v115, off offset:43
+; ALIGNED-NEXT: global_store_byte v[0:1], v15, off offset:41
+; ALIGNED-NEXT: global_store_byte v[0:1], v37, off offset:47
+; ALIGNED-NEXT: global_store_byte v[0:1], v14, off offset:45
+; ALIGNED-NEXT: global_store_byte v[0:1], v20, off offset:35
+; ALIGNED-NEXT: global_store_byte v[0:1], v13, off offset:33
+; ALIGNED-NEXT: global_store_byte v[0:1], v36, off offset:39
+; ALIGNED-NEXT: global_store_byte v[0:1], v12, off offset:37
+; ALIGNED-NEXT: global_store_byte v[0:1], v100, off offset:31
+; ALIGNED-NEXT: global_store_byte v[0:1], v11, off offset:29
+; ALIGNED-NEXT: global_store_byte v[0:1], v51, off offset:27
+; ALIGNED-NEXT: global_store_byte v[0:1], v10, off offset:25
+; ALIGNED-NEXT: global_store_byte v[0:1], v27, off offset:23
+; ALIGNED-NEXT: global_store_byte v[0:1], v9, off offset:21
+; ALIGNED-NEXT: global_store_byte v[0:1], v50, off offset:19
+; ALIGNED-NEXT: global_store_byte v[0:1], v8, off offset:17
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14
-; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:12
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v7, off offset:14
+; ALIGNED-NEXT: global_store_byte v[0:1], v7, off offset:12
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v6, off offset:10
-; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:8
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v6, off offset:10
+; ALIGNED-NEXT: global_store_byte v[0:1], v6, off offset:8
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v5, off offset:6
-; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:4
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v5, off offset:6
+; ALIGNED-NEXT: global_store_byte v[0:1], v5, off offset:4
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v4, off offset:2
-; ALIGNED-NEXT: global_store_byte v[16:17], v4, off
+; ALIGNED-NEXT: global_store_byte_d16_hi v[0:1], v4, off offset:2
+; ALIGNED-NEXT: global_store_byte v[0:1], v4, off
; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7
; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7
; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6
@@ -7752,376 +7754,362 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5
; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4
; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4
-; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:15
-; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:13
-; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:11
-; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:9
-; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:7
-; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:5
-; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3
-; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1
+; ALIGNED-NEXT: global_store_byte v[0:1], v8, off offset:15
+; ALIGNED-NEXT: global_store_byte v[0:1], v7, off offset:13
+; ALIGNED-NEXT: global_store_byte v[0:1], v9, off offset:11
+; ALIGNED-NEXT: global_store_byte v[0:1], v6, off offset:9
+; ALIGNED-NEXT: global_store_byte v[0:1], v10, off offset:7
+; ALIGNED-NEXT: global_store_byte v[0:1], v5, off offset:5
+; ALIGNED-NEXT: global_store_byte v[0:1], v11, off offset:3
+; ALIGNED-NEXT: global_store_byte v[0:1], v4, off offset:1
+; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; ALIGNED-NEXT: s_cbranch_scc1 .LBB6_2
-; ALIGNED-NEXT: .LBB6_3: ; %Flow7
-; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6
+; ALIGNED-NEXT: .LBB6_3: ; %Flow17
+; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
; ALIGNED-NEXT: s_cbranch_execz .LBB6_6
; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; ALIGNED-NEXT: s_movk_i32 s6, 0xff00
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700
-; ALIGNED-NEXT: s_mov_b32 s7, -1
+; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x700, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v14, vcc_lo, 0x700, v0
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, vcc_lo
+; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
+; ALIGNED-NEXT: s_mov_b32 s5, -1
; ALIGNED-NEXT: .LBB6_5: ; %memmove_bwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
-; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[24:25], off offset:240
-; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[24:25], off offset:224
-; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[24:25], off
-; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[24:25], off offset:16
-; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[24:25], off offset:32
-; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[24:25], off offset:48
-; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[24:25], off offset:64
-; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[24:25], off offset:80
-; ALIGNED-NEXT: global_load_dwordx4 v[26:29], v[24:25], off offset:96
-; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[24:25], off offset:112
-; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[24:25], off offset:128
-; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[24:25], off offset:144
-; ALIGNED-NEXT: global_load_dwordx4 v[66:69], v[24:25], off offset:160
-; ALIGNED-NEXT: global_load_dwordx4 v[81:84], v[24:25], off offset:176
-; ALIGNED-NEXT: global_load_dwordx4 v[96:99], v[24:25], off offset:192
-; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[24:25], off offset:208
+; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[12:13], off offset:240
+; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[12:13], off offset:224
+; ALIGNED-NEXT: global_load_dwordx4 v[0:3], v[12:13], off
+; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16
+; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32
+; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[12:13], off offset:48
+; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[12:13], off offset:64
+; ALIGNED-NEXT: global_load_dwordx4 v[116:119], v[12:13], off offset:80
+; ALIGNED-NEXT: global_load_dwordx4 v[40:43], v[12:13], off offset:96
+; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[12:13], off offset:112
+; ALIGNED-NEXT: global_load_dwordx4 v[34:37], v[12:13], off offset:128
+; ALIGNED-NEXT: global_load_dwordx4 v[44:47], v[12:13], off offset:144
+; ALIGNED-NEXT: global_load_dwordx4 v[54:57], v[12:13], off offset:160
+; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[12:13], off offset:176
+; ALIGNED-NEXT: global_load_dwordx4 v[82:85], v[12:13], off offset:192
+; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[12:13], off offset:208
+; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0xffffff00, v12
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v13, vcc_lo
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: s_waitcnt vmcnt(15)
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:328
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:348
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332
-; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328
-; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324
-; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320
-; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
-; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
-; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
+; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v31, off offset:254
-; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:252
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v33, off offset:254
+; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:252
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v30, off offset:250
-; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:248
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v32, off offset:250
+; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:248
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v25, off offset:246
-; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:244
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v24, off offset:242
-; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:240
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v27, off offset:246
+; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:244
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v26, off offset:242
+; ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:240
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:348
-; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:344
-; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340
-; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336
-; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
+; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:352
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v51, off offset:238
-; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:236
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v53, off offset:238
+; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:236
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v50, off offset:234
-; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:232
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v52, off offset:234
+; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:232
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v49, off offset:230
-; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:228
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v36, off offset:226
-; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:224
-; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288
-; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292
-; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296
-; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v51, off offset:230
+; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:228
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v50, off offset:226
+; ALIGNED-NEXT: global_store_byte v[14:15], v50, off offset:224
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:316
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300
-; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296
-; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:292
-; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30
+; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:304
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v71, off offset:222
-; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:220
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v81, off offset:222
+; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:220
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v70, off offset:218
-; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:216
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v80, off offset:218
+; ALIGNED-NEXT: global_store_byte v[14:15], v80, off offset:216
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v65, off offset:214
-; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:212
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v64, off offset:210
-; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:208
-; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308
-; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312
-; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v67, off offset:214
+; ALIGNED-NEXT: global_store_byte v[14:15], v67, off offset:212
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v66, off offset:210
+; ALIGNED-NEXT: global_store_byte v[14:15], v66, off offset:208
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:320
+; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:332
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:316
-; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312
-; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308
-; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:320
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v87, off offset:206
-; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:204
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v98, off offset:206
+; ALIGNED-NEXT: global_store_byte v[14:15], v98, off offset:204
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v86, off offset:202
-; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:200
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v87, off offset:202
+; ALIGNED-NEXT: global_store_byte v[14:15], v87, off offset:200
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v85, off offset:198
-; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:196
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v86, off offset:198
+; ALIGNED-NEXT: global_store_byte v[14:15], v86, off offset:196
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v80, off offset:194
-; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:192
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384
-; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388
-; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392
-; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v82, off offset:194
+; ALIGNED-NEXT: global_store_byte v[14:15], v82, off offset:192
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396
-; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392
-; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:388
-; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:412
+; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:400
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v101, off offset:190
-; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:188
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v99, off offset:190
+; ALIGNED-NEXT: global_store_byte v[14:15], v99, off offset:188
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v99, off offset:186
-; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:184
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v96, off offset:186
+; ALIGNED-NEXT: global_store_byte v[14:15], v96, off offset:184
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v96, off offset:182
-; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:180
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v81, off offset:178
-; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:176
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412
-; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:412
-; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408
-; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404
-; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v100, off offset:174
-; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:172
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v97, off offset:170
-; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:168
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v82, off offset:166
-; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:164
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v66, off offset:162
-; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:160
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v83, off offset:182
+; ALIGNED-NEXT: global_store_byte v[14:15], v83, off offset:180
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v68, off offset:178
+; ALIGNED-NEXT: global_store_byte v[14:15], v68, off offset:176
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:428
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364
-; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360
-; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356
-; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352
+; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:416
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v98, off offset:158
-; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:156
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v97, off offset:174
+; ALIGNED-NEXT: global_store_byte v[14:15], v97, off offset:172
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v83, off offset:154
-; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:152
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v84, off offset:170
+; ALIGNED-NEXT: global_store_byte v[14:15], v84, off offset:168
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v67, off offset:150
-; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:148
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v69, off offset:166
+; ALIGNED-NEXT: global_store_byte v[14:15], v69, off offset:164
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v52, off offset:146
-; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:144
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v54, off offset:162
+; ALIGNED-NEXT: global_store_byte v[14:15], v54, off offset:160
; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368
; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372
; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376
; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380
-; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376
-; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372
-; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:380
+; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:376
+; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:372
+; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:368
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v84, off offset:142
-; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:140
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v85, off offset:158
+; ALIGNED-NEXT: global_store_byte v[14:15], v85, off offset:156
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v68, off offset:138
-; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:136
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v70, off offset:154
+; ALIGNED-NEXT: global_store_byte v[14:15], v70, off offset:152
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v53, off offset:134
-; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:132
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v37, off offset:130
-; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:128
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456
-; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v55, off offset:150
+; ALIGNED-NEXT: global_store_byte v[14:15], v55, off offset:148
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v38, off offset:146
+; ALIGNED-NEXT: global_store_byte v[14:15], v38, off offset:144
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:396
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460
-; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:456
-; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:452
-; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v69, off offset:126
-; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:124
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v71, off offset:142
+; ALIGNED-NEXT: global_store_byte v[14:15], v71, off offset:140
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v54, off offset:122
-; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:120
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v64, off offset:138
+; ALIGNED-NEXT: global_store_byte v[14:15], v64, off offset:136
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v38, off offset:118
-; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:116
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v32, off offset:114
-; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:112
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v39, off offset:134
+; ALIGNED-NEXT: global_store_byte v[14:15], v39, off offset:132
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v34, off offset:130
+; ALIGNED-NEXT: global_store_byte v[14:15], v34, off offset:128
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:476
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:476
-; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472
-; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468
-; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:464
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v55, off offset:110
-; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:108
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v65, off offset:126
+; ALIGNED-NEXT: global_store_byte v[14:15], v65, off offset:124
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v39, off offset:106
-; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:104
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v48, off offset:122
+; ALIGNED-NEXT: global_store_byte v[14:15], v48, off offset:120
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v33, off offset:102
-; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:100
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v26, off offset:98
-; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:96
-; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:416
-; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420
-; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424
-; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v35, off offset:118
+; ALIGNED-NEXT: global_store_byte v[14:15], v35, off offset:116
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v28, off offset:114
+; ALIGNED-NEXT: global_store_byte v[14:15], v28, off offset:112
+; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:492
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428
-; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424
-; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420
-; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:480
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v48, off offset:94
-; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:92
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v49, off offset:110
+; ALIGNED-NEXT: global_store_byte v[14:15], v49, off offset:108
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v34, off offset:90
-; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:88
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v36, off offset:106
+; ALIGNED-NEXT: global_store_byte v[14:15], v36, off offset:104
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v27, off offset:86
-; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:84
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v29, off offset:102
+; ALIGNED-NEXT: global_store_byte v[14:15], v29, off offset:100
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v21, off offset:82
-; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:80
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v22, off offset:98
+; ALIGNED-NEXT: global_store_byte v[14:15], v22, off offset:96
; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432
; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436
; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440
; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:444
-; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440
-; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:436
; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v35, off offset:78
-; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:76
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v37, off offset:94
+; ALIGNED-NEXT: global_store_byte v[14:15], v37, off offset:92
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v28, off offset:74
-; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:72
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v30, off offset:90
+; ALIGNED-NEXT: global_store_byte v[14:15], v30, off offset:88
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v22, off offset:70
-; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:68
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v19, off offset:66
-; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:64
-; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512
-; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:516
-; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:520
-; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v23, off offset:86
+; ALIGNED-NEXT: global_store_byte v[14:15], v23, off offset:84
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v19, off offset:82
+; ALIGNED-NEXT: global_store_byte v[14:15], v19, off offset:80
+; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:460
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524
-; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520
-; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516
-; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52
; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51
; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51
; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50
; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v29, off offset:62
-; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:60
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v31, off offset:78
+; ALIGNED-NEXT: global_store_byte v[14:15], v31, off offset:76
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v23, off offset:58
-; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:56
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v24, off offset:74
+; ALIGNED-NEXT: global_store_byte v[14:15], v24, off offset:72
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v20, off offset:54
-; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:52
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v18, off offset:50
-; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:48
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528
-; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532
-; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536
-; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v20, off offset:70
+; ALIGNED-NEXT: global_store_byte v[14:15], v20, off offset:68
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v17, off offset:66
+; ALIGNED-NEXT: global_store_byte v[14:15], v17, off offset:64
+; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:528
+; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:536
+; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:540
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536
-; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540
-; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528
-; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:540
+; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536
+; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v26
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v15, off offset:42
-; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:40
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v25, off offset:62
+; ALIGNED-NEXT: global_store_byte v[14:15], v25, off offset:60
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v14, off offset:46
-; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:44
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v21, off offset:58
+; ALIGNED-NEXT: global_store_byte v[14:15], v21, off offset:56
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v13, off offset:34
-; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:32
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v12, off offset:38
-; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:36
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v18, off offset:54
+; ALIGNED-NEXT: global_store_byte v[14:15], v18, off offset:52
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v16, off offset:50
+; ALIGNED-NEXT: global_store_byte v[14:15], v16, off offset:48
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:544
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:548
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:552
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:556
; ALIGNED-NEXT: s_clause 0x3
-; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492
-; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488
-; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484
-; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:552
+; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:556
+; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:544
+; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:548
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v11, off offset:30
-; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:28
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v11, off offset:42
+; ALIGNED-NEXT: global_store_byte v[14:15], v11, off offset:40
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v10, off offset:26
-; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:24
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v10, off offset:46
+; ALIGNED-NEXT: global_store_byte v[14:15], v10, off offset:44
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v9, off offset:22
-; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:20
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v9, off offset:34
+; ALIGNED-NEXT: global_store_byte v[14:15], v9, off offset:32
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v8, off offset:18
-; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:16
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v8, off offset:38
+; ALIGNED-NEXT: global_store_byte v[14:15], v8, off offset:36
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504
@@ -8131,274 +8119,295 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504
; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500
; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496
-; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:247
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65
-; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:255
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49
-; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:253
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36
-; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:251
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71
-; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:249
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v7, off offset:30
+; ALIGNED-NEXT: global_store_byte v[14:15], v7, off offset:28
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v6, off offset:26
+; ALIGNED-NEXT: global_store_byte v[14:15], v6, off offset:24
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v5, off offset:22
+; ALIGNED-NEXT: global_store_byte v[14:15], v5, off offset:20
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v4, off offset:18
+; ALIGNED-NEXT: global_store_byte v[14:15], v4, off offset:16
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524
+; ALIGNED-NEXT: s_clause 0x3
+; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520
+; ALIGNED-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512
+; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:255
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81
+; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:253
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v80
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80
+; ALIGNED-NEXT: global_store_byte v[14:15], v101, off offset:251
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v67
+; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:221
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v70
; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65
-; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:245
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64
-; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:215
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67
; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67
-; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:243
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87
+; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:249
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v66
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66
+; ALIGNED-NEXT: global_store_byte v[14:15], v102, off offset:247
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v98
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98
+; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:245
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v87
; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87
-; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:241
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86
+; ALIGNED-NEXT: global_store_byte v[14:15], v103, off offset:243
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v86
; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86
-; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:239
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85
-; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:237
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80
-; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:235
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101
-; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:233
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99
+; ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:241
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v82
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82
+; ALIGNED-NEXT: global_store_byte v[14:15], v112, off offset:239
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99
; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99
-; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:231
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96
+; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:237
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v96
; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96
-; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:229
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81
-; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:227
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100
-; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:225
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97
-; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:223
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82
-; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:221
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66
-; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:219
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98
-; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:217
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83
+; ALIGNED-NEXT: global_store_byte v[14:15], v113, off offset:235
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v83
; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83
-; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:213
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52
-; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:211
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84
-; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:209
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68
-; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:149
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:233
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v68
; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68
-; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:207
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53
-; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:205
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37
-; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:203
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69
+; ALIGNED-NEXT: global_store_byte v[14:15], v114, off offset:231
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97
+; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:229
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v84
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84
+; ALIGNED-NEXT: global_store_byte v[14:15], v115, off offset:227
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v69
; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69
-; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:201
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54
+; ALIGNED-NEXT: global_store_byte v[14:15], v50, off offset:225
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v54
; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54
-; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:199
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38
-; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:197
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32
-; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:195
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55
+; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:223
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85
+; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:219
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v55
; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55
-; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:193
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39
+; ALIGNED-NEXT: global_store_byte v[14:15], v80, off offset:217
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v38
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38
+; ALIGNED-NEXT: global_store_byte v[14:15], v101, off offset:215
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v71
+; ALIGNED-NEXT: global_store_byte v[14:15], v70, off offset:153
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v4
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71
+; ALIGNED-NEXT: global_store_byte v[14:15], v67, off offset:213
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v64
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64
+; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:211
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v39
; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39
-; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:191
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33
-; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:189
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26
-; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:187
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48
-; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:185
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34
+; ALIGNED-NEXT: global_store_byte v[14:15], v66, off offset:209
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v34
; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34
-; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:183
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27
-; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:181
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21
-; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:179
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35
+; ALIGNED-NEXT: global_store_byte v[14:15], v102, off offset:207
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65
+; ALIGNED-NEXT: global_store_byte v[14:15], v98, off offset:205
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48
+; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:203
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v35
; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35
-; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:177
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28
+; ALIGNED-NEXT: global_store_byte v[14:15], v87, off offset:201
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v28
; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28
-; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:175
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22
-; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:173
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19
-; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:171
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29
+; ALIGNED-NEXT: global_store_byte v[14:15], v103, off offset:199
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v49
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49
+; ALIGNED-NEXT: global_store_byte v[14:15], v86, off offset:197
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v36
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36
+; ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:195
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v29
; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29
-; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:169
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23
+; ALIGNED-NEXT: global_store_byte v[14:15], v82, off offset:193
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v22
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22
+; ALIGNED-NEXT: global_store_byte v[14:15], v112, off offset:191
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v37
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37
+; ALIGNED-NEXT: global_store_byte v[14:15], v99, off offset:189
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30
+; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:187
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v23
; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23
-; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:167
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20
+; ALIGNED-NEXT: global_store_byte v[14:15], v96, off offset:185
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v19
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19
+; ALIGNED-NEXT: global_store_byte v[14:15], v113, off offset:183
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v31
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31
+; ALIGNED-NEXT: global_store_byte v[14:15], v83, off offset:181
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v24
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:179
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v20
; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20
-; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:165
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT: global_store_byte v[14:15], v68, off offset:177
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v17
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17
+; ALIGNED-NEXT: global_store_byte v[14:15], v114, off offset:175
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v25
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25
+; ALIGNED-NEXT: global_store_byte v[14:15], v97, off offset:173
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v21
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21
+; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:171
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v18
; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18
-; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:163
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15
-; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:161
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14
-; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:159
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13
-; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:157
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12
-; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:155
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11
+; ALIGNED-NEXT: global_store_byte v[14:15], v84, off offset:169
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16
+; ALIGNED-NEXT: global_store_byte v[14:15], v115, off offset:167
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v11
; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11
-; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:153
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10
+; ALIGNED-NEXT: global_store_byte v[14:15], v69, off offset:165
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10
; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10
-; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:151
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9
+; ALIGNED-NEXT: global_store_byte v[14:15], v50, off offset:163
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v9
; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9
-; ALIGNED-NEXT: global_store_byte v[16:17], v65, off offset:147
-; ALIGNED-NEXT: global_store_byte v[16:17], v52, off offset:145
-; ALIGNED-NEXT: global_store_byte v[16:17], v25, off offset:143
-; ALIGNED-NEXT: global_store_byte v[16:17], v84, off offset:141
-; ALIGNED-NEXT: global_store_byte v[16:17], v64, off offset:139
-; ALIGNED-NEXT: global_store_byte v[16:17], v68, off offset:137
-; ALIGNED-NEXT: global_store_byte v[16:17], v113, off offset:135
-; ALIGNED-NEXT: global_store_byte v[16:17], v53, off offset:133
-; ALIGNED-NEXT: global_store_byte v[16:17], v87, off offset:131
-; ALIGNED-NEXT: global_store_byte v[16:17], v37, off offset:129
-; ALIGNED-NEXT: global_store_byte v[16:17], v24, off offset:127
-; ALIGNED-NEXT: global_store_byte v[16:17], v69, off offset:125
-; ALIGNED-NEXT: global_store_byte v[16:17], v86, off offset:123
-; ALIGNED-NEXT: global_store_byte v[16:17], v54, off offset:121
-; ALIGNED-NEXT: global_store_byte v[16:17], v114, off offset:119
-; ALIGNED-NEXT: global_store_byte v[16:17], v38, off offset:117
-; ALIGNED-NEXT: global_store_byte v[16:17], v85, off offset:115
-; ALIGNED-NEXT: global_store_byte v[16:17], v32, off offset:113
-; ALIGNED-NEXT: global_store_byte v[16:17], v51, off offset:111
-; ALIGNED-NEXT: global_store_byte v[16:17], v55, off offset:109
-; ALIGNED-NEXT: global_store_byte v[16:17], v80, off offset:107
-; ALIGNED-NEXT: global_store_byte v[16:17], v39, off offset:105
-; ALIGNED-NEXT: global_store_byte v[16:17], v115, off offset:103
-; ALIGNED-NEXT: global_store_byte v[16:17], v33, off offset:101
-; ALIGNED-NEXT: global_store_byte v[16:17], v101, off offset:99
-; ALIGNED-NEXT: global_store_byte v[16:17], v26, off offset:97
-; ALIGNED-NEXT: global_store_byte v[16:17], v50, off offset:95
-; ALIGNED-NEXT: global_store_byte v[16:17], v48, off offset:93
-; ALIGNED-NEXT: global_store_byte v[16:17], v99, off offset:91
-; ALIGNED-NEXT: global_store_byte v[16:17], v34, off offset:89
-; ALIGNED-NEXT: global_store_byte v[16:17], v102, off offset:87
-; ALIGNED-NEXT: global_store_byte v[16:17], v27, off offset:85
-; ALIGNED-NEXT: global_store_byte v[16:17], v96, off offset:83
-; ALIGNED-NEXT: global_store_byte v[16:17], v21, off offset:81
-; ALIGNED-NEXT: global_store_byte v[16:17], v49, off offset:79
-; ALIGNED-NEXT: global_store_byte v[16:17], v35, off offset:77
-; ALIGNED-NEXT: global_store_byte v[16:17], v81, off offset:75
-; ALIGNED-NEXT: global_store_byte v[16:17], v28, off offset:73
-; ALIGNED-NEXT: global_store_byte v[16:17], v31, off offset:71
-; ALIGNED-NEXT: global_store_byte v[16:17], v22, off offset:69
-; ALIGNED-NEXT: global_store_byte v[16:17], v100, off offset:67
-; ALIGNED-NEXT: global_store_byte v[16:17], v19, off offset:65
-; ALIGNED-NEXT: global_store_byte v[16:17], v36, off offset:63
-; ALIGNED-NEXT: global_store_byte v[16:17], v29, off offset:61
-; ALIGNED-NEXT: global_store_byte v[16:17], v97, off offset:59
-; ALIGNED-NEXT: global_store_byte v[16:17], v23, off offset:57
-; ALIGNED-NEXT: global_store_byte v[16:17], v103, off offset:55
-; ALIGNED-NEXT: global_store_byte v[16:17], v20, off offset:53
-; ALIGNED-NEXT: global_store_byte v[16:17], v82, off offset:51
-; ALIGNED-NEXT: global_store_byte v[16:17], v18, off offset:49
-; ALIGNED-NEXT: global_store_byte v[16:17], v71, off offset:43
-; ALIGNED-NEXT: global_store_byte v[16:17], v15, off offset:41
-; ALIGNED-NEXT: global_store_byte v[16:17], v66, off offset:47
-; ALIGNED-NEXT: global_store_byte v[16:17], v14, off offset:45
-; ALIGNED-NEXT: global_store_byte v[16:17], v30, off offset:35
-; ALIGNED-NEXT: global_store_byte v[16:17], v13, off offset:33
-; ALIGNED-NEXT: global_store_byte v[16:17], v98, off offset:39
-; ALIGNED-NEXT: global_store_byte v[16:17], v12, off offset:37
-; ALIGNED-NEXT: global_store_byte v[16:17], v70, off offset:31
-; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:29
-; ALIGNED-NEXT: global_store_byte v[16:17], v83, off offset:27
-; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:25
-; ALIGNED-NEXT: global_store_byte v[16:17], v112, off offset:23
-; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:21
-; ALIGNED-NEXT: global_store_byte v[16:17], v67, off offset:19
-; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:17
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v7, off offset:14
-; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:12
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v6, off offset:10
-; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:8
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v5, off offset:6
-; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:4
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: global_store_byte_d16_hi v[16:17], v4, off offset:2
-; ALIGNED-NEXT: global_store_byte v[16:17], v4, off
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7
+; ALIGNED-NEXT: global_store_byte v[14:15], v54, off offset:161
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:159
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v7
; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6
+; ALIGNED-NEXT: global_store_byte v[14:15], v85, off offset:157
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v6
; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 8, v6
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 24, v5
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4
-; ALIGNED-NEXT: global_store_byte v[16:17], v8, off offset:15
-; ALIGNED-NEXT: global_store_byte v[16:17], v7, off offset:13
-; ALIGNED-NEXT: global_store_byte v[16:17], v9, off offset:11
-; ALIGNED-NEXT: global_store_byte v[16:17], v6, off offset:9
-; ALIGNED-NEXT: global_store_byte v[16:17], v10, off offset:7
-; ALIGNED-NEXT: global_store_byte v[16:17], v5, off offset:5
-; ALIGNED-NEXT: global_store_byte v[16:17], v11, off offset:3
-; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1
-; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5
-; ALIGNED-NEXT: .LBB6_6: ; %Flow8
-; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload
-; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32
-; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4
-; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8
-; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12
-; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16
-; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20
-; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24
-; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:155
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v5
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5
+; ALIGNED-NEXT: global_store_byte v[14:15], v33, off offset:151
+; ALIGNED-NEXT: global_store_byte v[14:15], v55, off offset:149
+; ALIGNED-NEXT: global_store_byte v[14:15], v80, off offset:147
+; ALIGNED-NEXT: global_store_byte v[14:15], v38, off offset:145
+; ALIGNED-NEXT: global_store_byte v[14:15], v101, off offset:143
+; ALIGNED-NEXT: global_store_byte v[14:15], v71, off offset:141
+; ALIGNED-NEXT: global_store_byte v[14:15], v67, off offset:139
+; ALIGNED-NEXT: global_store_byte v[14:15], v64, off offset:137
+; ALIGNED-NEXT: global_store_byte v[14:15], v32, off offset:135
+; ALIGNED-NEXT: global_store_byte v[14:15], v39, off offset:133
+; ALIGNED-NEXT: global_store_byte v[14:15], v66, off offset:131
+; ALIGNED-NEXT: global_store_byte v[14:15], v34, off offset:129
+; ALIGNED-NEXT: global_store_byte v[14:15], v102, off offset:127
+; ALIGNED-NEXT: global_store_byte v[14:15], v65, off offset:125
+; ALIGNED-NEXT: global_store_byte v[14:15], v98, off offset:123
+; ALIGNED-NEXT: global_store_byte v[14:15], v48, off offset:121
+; ALIGNED-NEXT: global_store_byte v[14:15], v27, off offset:119
+; ALIGNED-NEXT: global_store_byte v[14:15], v35, off offset:117
+; ALIGNED-NEXT: global_store_byte v[14:15], v87, off offset:115
+; ALIGNED-NEXT: global_store_byte v[14:15], v28, off offset:113
+; ALIGNED-NEXT: global_store_byte v[14:15], v103, off offset:111
+; ALIGNED-NEXT: global_store_byte v[14:15], v49, off offset:109
+; ALIGNED-NEXT: global_store_byte v[14:15], v86, off offset:107
+; ALIGNED-NEXT: global_store_byte v[14:15], v36, off offset:105
+; ALIGNED-NEXT: global_store_byte v[14:15], v26, off offset:103
+; ALIGNED-NEXT: global_store_byte v[14:15], v29, off offset:101
+; ALIGNED-NEXT: global_store_byte v[14:15], v82, off offset:99
+; ALIGNED-NEXT: global_store_byte v[14:15], v22, off offset:97
+; ALIGNED-NEXT: global_store_byte v[14:15], v112, off offset:95
+; ALIGNED-NEXT: global_store_byte v[14:15], v37, off offset:93
+; ALIGNED-NEXT: global_store_byte v[14:15], v99, off offset:91
+; ALIGNED-NEXT: global_store_byte v[14:15], v30, off offset:89
+; ALIGNED-NEXT: global_store_byte v[14:15], v53, off offset:87
+; ALIGNED-NEXT: global_store_byte v[14:15], v23, off offset:85
+; ALIGNED-NEXT: global_store_byte v[14:15], v96, off offset:83
+; ALIGNED-NEXT: global_store_byte v[14:15], v19, off offset:81
+; ALIGNED-NEXT: global_store_byte v[14:15], v113, off offset:79
+; ALIGNED-NEXT: global_store_byte v[14:15], v31, off offset:77
+; ALIGNED-NEXT: global_store_byte v[14:15], v83, off offset:75
+; ALIGNED-NEXT: global_store_byte v[14:15], v24, off offset:73
+; ALIGNED-NEXT: global_store_byte v[14:15], v52, off offset:71
+; ALIGNED-NEXT: global_store_byte v[14:15], v20, off offset:69
+; ALIGNED-NEXT: global_store_byte v[14:15], v68, off offset:67
+; ALIGNED-NEXT: global_store_byte v[14:15], v17, off offset:65
+; ALIGNED-NEXT: global_store_byte v[14:15], v114, off offset:63
+; ALIGNED-NEXT: global_store_byte v[14:15], v25, off offset:61
+; ALIGNED-NEXT: global_store_byte v[14:15], v97, off offset:59
+; ALIGNED-NEXT: global_store_byte v[14:15], v21, off offset:57
+; ALIGNED-NEXT: global_store_byte v[14:15], v51, off offset:55
+; ALIGNED-NEXT: global_store_byte v[14:15], v18, off offset:53
+; ALIGNED-NEXT: global_store_byte v[14:15], v84, off offset:51
+; ALIGNED-NEXT: global_store_byte v[14:15], v16, off offset:49
+; ALIGNED-NEXT: global_store_byte v[14:15], v115, off offset:43
+; ALIGNED-NEXT: global_store_byte v[14:15], v11, off offset:41
+; ALIGNED-NEXT: global_store_byte v[14:15], v69, off offset:47
+; ALIGNED-NEXT: global_store_byte v[14:15], v10, off offset:45
+; ALIGNED-NEXT: global_store_byte v[14:15], v50, off offset:35
+; ALIGNED-NEXT: global_store_byte v[14:15], v9, off offset:33
+; ALIGNED-NEXT: global_store_byte v[14:15], v54, off offset:39
+; ALIGNED-NEXT: global_store_byte v[14:15], v8, off offset:37
+; ALIGNED-NEXT: global_store_byte v[14:15], v100, off offset:31
+; ALIGNED-NEXT: global_store_byte v[14:15], v7, off offset:29
+; ALIGNED-NEXT: global_store_byte v[14:15], v85, off offset:27
+; ALIGNED-NEXT: global_store_byte v[14:15], v6, off offset:25
+; ALIGNED-NEXT: global_store_byte v[14:15], v81, off offset:23
+; ALIGNED-NEXT: global_store_byte v[14:15], v5, off offset:21
+; ALIGNED-NEXT: global_store_byte v[14:15], v70, off offset:19
+; ALIGNED-NEXT: global_store_byte v[14:15], v4, off offset:17
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v3, off offset:14
+; ALIGNED-NEXT: global_store_byte v[14:15], v3, off offset:12
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v2, off offset:10
+; ALIGNED-NEXT: global_store_byte v[14:15], v2, off offset:8
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v1, off offset:6
+; ALIGNED-NEXT: global_store_byte v[14:15], v1, off offset:4
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: global_store_byte_d16_hi v[14:15], v0, off offset:2
+; ALIGNED-NEXT: global_store_byte v[14:15], v0, off
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 24, v3
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 8, v3
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 24, v2
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v6, 24, v1
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v0, 8, v0
+; ALIGNED-NEXT: global_store_byte v[14:15], v4, off offset:15
+; ALIGNED-NEXT: global_store_byte v[14:15], v3, off offset:13
+; ALIGNED-NEXT: global_store_byte v[14:15], v5, off offset:11
+; ALIGNED-NEXT: global_store_byte v[14:15], v2, off offset:9
+; ALIGNED-NEXT: global_store_byte v[14:15], v6, off offset:7
+; ALIGNED-NEXT: global_store_byte v[14:15], v1, off offset:5
+; ALIGNED-NEXT: global_store_byte v[14:15], v7, off offset:3
+; ALIGNED-NEXT: global_store_byte v[14:15], v0, off offset:1
+; ALIGNED-NEXT: v_add_co_u32 v14, vcc_lo, 0xffffff00, v14
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v15, null, -1, v15, vcc_lo
+; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5
+; ALIGNED-NEXT: .LBB6_6: ; %Flow18
+; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; ALIGNED-NEXT: s_clause 0x9 ; 40-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32
+; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
@@ -8410,27 +8419,31 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4
; UNROLL3-NEXT: s_cbranch_execz .LBB6_4
; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
-; UNROLL3-NEXT: s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT: v_mov_b32_e32 v5, v3
+; UNROLL3-NEXT: v_mov_b32_e32 v7, v1
+; UNROLL3-NEXT: v_mov_b32_e32 v4, v2
+; UNROLL3-NEXT: v_mov_b32_e32 v6, v0
+; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0
; UNROLL3-NEXT: .p2align 6
; UNROLL3-NEXT: .LBB6_2: ; %memmove_fwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
-; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
-; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_clause 0x2
-; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off
-; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16
-; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32
-; UNROLL3-NEXT: s_add_u32 s4, s4, 48
-; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[4:5], off
+; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:16
+; UNROLL3-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:32
+; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, v4, 48
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0
+; UNROLL3-NEXT: s_addc_u32 s5, s5, -1
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
-; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[4:7], off
+; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[8:11], off
; UNROLL3-NEXT: s_waitcnt vmcnt(1)
-; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:16
+; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[12:15], off offset:16
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
-; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:32
-; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0
+; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[16:19], off offset:32
+; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v6, 48
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0
; UNROLL3-NEXT: s_cbranch_scc1 .LBB6_2
; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual
; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016
@@ -8439,44 +8452,47 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1
; UNROLL3-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:2032
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:2032
-; UNROLL3-NEXT: ; implicit-def: $vgpr2_vgpr3
-; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; UNROLL3-NEXT: .LBB6_4: ; %Flow5
-; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6
+; UNROLL3-NEXT: ; implicit-def: $vgpr2
+; UNROLL3-NEXT: ; implicit-def: $vgpr0
+; UNROLL3-NEXT: .LBB6_4: ; %Flow15
+; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6
; UNROLL3-NEXT: s_cbranch_execz .LBB6_7
; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual
; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2032
-; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0
-; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0
-; UNROLL3-NEXT: s_mov_b32 s7, -1
+; UNROLL3-NEXT: s_movk_i32 s4, 0xf820
+; UNROLL3-NEXT: s_mov_b32 s5, -1
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2032
-; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2016
+; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:2016
+; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0x7b0, v2
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0x7b0, v0
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
-; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:2016
+; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:2016
; UNROLL3-NEXT: .p2align 6
; UNROLL3-NEXT: .LBB6_6: ; %memmove_bwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
-; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
-; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_clause 0x2
-; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off
-; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:16
-; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32
-; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0
-; UNROLL3-NEXT: s_addc_u32 s5, s5, -1
+; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; UNROLL3-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
+; UNROLL3-NEXT: global_load_dwordx4 v[14:17], v[2:3], off offset:32
+; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffffd0, v2
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
+; UNROLL3-NEXT: s_add_u32 s4, s4, 48
+; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
-; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[4:7], off
+; UNROLL3-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
; UNROLL3-NEXT: s_waitcnt vmcnt(1)
-; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:16
+; UNROLL3-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:16
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
-; UNROLL3-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:32
-; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
+; UNROLL3-NEXT: global_store_dwordx4 v[4:5], v[14:17], off offset:32
+; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffffd0, v4
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, vcc_lo
+; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0
; UNROLL3-NEXT: s_cbranch_scc0 .LBB6_6
-; UNROLL3-NEXT: .LBB6_7: ; %Flow6
-; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; UNROLL3-NEXT: .LBB6_7: ; %Flow16
+; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6
; UNROLL3-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 2048, i1 false)
@@ -8492,138 +8508,139 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4
; CHECK-NEXT: s_cbranch_execz .LBB7_3
; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0x800
; CHECK-NEXT: .LBB7_2: ; %memmove_fwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_clause 0xf
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:240
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:224
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:208
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:192
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:176
-; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:160
-; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:144
-; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:128
-; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:112
-; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:96
-; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:80
-; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:64
-; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:48
-; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32
-; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off
-; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48
-; CHECK-NEXT: s_add_u32 s4, s4, 0x100
-; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
-; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:240
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:224
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:208
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:192
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:176
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:160
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:144
+; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:128
+; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:64
+; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:32
+; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:112
+; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:96
+; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:48
+; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, 48
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, 0, v1, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00
+; CHECK-NEXT: s_addc_u32 s5, s5, -1
; CHECK-NEXT: s_waitcnt vmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[4:7] offset:192
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:192
; CHECK-NEXT: s_waitcnt vmcnt(14)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[8:11] offset:176
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:176
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[12:15] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:160
; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[16:19] offset:144
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:144
; CHECK-NEXT: s_waitcnt vmcnt(11)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(10)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:112
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[32:35] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[36:39] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67]
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71]
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99]
-; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
-; CHECK-NEXT: .LBB7_3: ; %Flow6
-; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6
+; CHECK-NEXT: .LBB7_3: ; %Flow16
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
; CHECK-NEXT: s_cbranch_execz .LBB7_6
; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; CHECK-NEXT: s_movk_i32 s6, 0xff00
-; CHECK-NEXT: s_mov_b64 s[4:5], 0x700
-; CHECK-NEXT: s_mov_b32 s7, -1
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; CHECK-NEXT: s_movk_i32 s4, 0xf800
+; CHECK-NEXT: s_mov_b32 s5, -1
; CHECK-NEXT: .LBB7_5: ; %memmove_bwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
; CHECK-NEXT: s_clause 0xf
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[96:97], off offset:240
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v[96:97], off offset:224
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v[96:97], off offset:208
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v[96:97], off offset:192
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v[96:97], off offset:176
-; CHECK-NEXT: global_load_dwordx4 v[24:27], v[96:97], off offset:160
-; CHECK-NEXT: global_load_dwordx4 v[28:31], v[96:97], off offset:144
-; CHECK-NEXT: global_load_dwordx4 v[32:35], v[96:97], off offset:128
-; CHECK-NEXT: global_load_dwordx4 v[36:39], v[96:97], off offset:112
-; CHECK-NEXT: global_load_dwordx4 v[48:51], v[96:97], off offset:96
-; CHECK-NEXT: global_load_dwordx4 v[52:55], v[96:97], off offset:80
-; CHECK-NEXT: global_load_dwordx4 v[64:67], v[96:97], off offset:64
-; CHECK-NEXT: global_load_dwordx4 v[68:71], v[96:97], off offset:48
-; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32
-; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off
-; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48
-; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00
-; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
-; CHECK-NEXT: s_addc_u32 s5, s5, -1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2032
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:2016
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:2000
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:1984
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:1968
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:1952
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:1936
+; CHECK-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:1920
+; CHECK-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:1856
+; CHECK-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:1824
+; CHECK-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:1808
+; CHECK-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:1792
+; CHECK-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:1904
+; CHECK-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:1888
+; CHECK-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:1872
+; CHECK-NEXT: global_load_dwordx4 v[96:99], v[2:3], off offset:1840
+; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, 48
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, 0, v1, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
+; CHECK-NEXT: s_add_u32 s4, s4, 0x100
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: s_waitcnt vmcnt(15)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[4:7] offset:192
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:192
; CHECK-NEXT: s_waitcnt vmcnt(14)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[8:11] offset:176
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:176
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[12:15] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:160
; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[16:19] offset:144
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:144
; CHECK-NEXT: s_waitcnt vmcnt(11)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(10)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:112
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[32:35] offset:128
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[36:39] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[48:51] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[52:55] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67]
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71]
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:64
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99]
-; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
; CHECK-NEXT: s_cbranch_scc0 .LBB7_5
-; CHECK-NEXT: .LBB7_6: ; %Flow7
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: .LBB7_6: ; %Flow17
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
;
@@ -8635,1005 +8652,1004 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4
; ALIGNED-NEXT: s_cbranch_execz .LBB7_3
; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800
; ALIGNED-NEXT: .LBB7_2: ; %memmove_fwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
-; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240
-; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:224
-; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208
-; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192
-; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176
-; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160
-; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144
-; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128
-; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112
-; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96
-; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80
-; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64
-; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48
-; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32
-; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16
-; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off
-; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 6
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 3
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo
-; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
-; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[2:3], off offset:240
+; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:224
+; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:208
+; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:192
+; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:176
+; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:160
+; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:144
+; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:128
+; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:112
+; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:96
+; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:80
+; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:64
+; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:48
+; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:32
+; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, 6
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v98, vcc_lo, v0, 3
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v99, null, 0, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
+; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
; ALIGNED-NEXT: s_waitcnt vmcnt(15)
-; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:168
-; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:172
-; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:164
-; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:160
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v100 offset:244
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v101 offset:248
-; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:249
-; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v99 offset:240
-; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:241
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v98 offset:236
-; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:237
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v101
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98
-; ALIGNED-NEXT: s_waitcnt vmcnt(14)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114
-; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:248
-; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:246
-; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:252
-; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:250
-; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:244
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115
-; ALIGNED-NEXT: flat_store_byte v[96:97], v117 offset:242
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115
-; ALIGNED-NEXT: flat_store_byte v[96:97], v118 offset:240
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113
-; ALIGNED-NEXT: flat_store_byte v[96:97], v119 offset:238
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113
-; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184
-; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188
-; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180
-; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v114 offset:228
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v115 offset:232
-; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233
-; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:229
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v113 offset:224
-; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:225
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v112 offset:220
-; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:221
+; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:244
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v114
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v114
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:248
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v115
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:249
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v115
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:245
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:240
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:241
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:236
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:237
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v113
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v113
; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112
; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:248
+; ALIGNED-NEXT: s_waitcnt vmcnt(14)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v86
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:246
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v86
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:252
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v87
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:250
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v87
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:244
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v85
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:242
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v85
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:240
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v84
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:238
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v84
+; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:228
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:232
+; ALIGNED-NEXT: flat_store_byte v[98:99], v87 offset:233
+; ALIGNED-NEXT: flat_store_byte v[98:99], v86 offset:229
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:224
+; ALIGNED-NEXT: flat_store_byte v[98:99], v85 offset:225
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:220
+; ALIGNED-NEXT: flat_store_byte v[98:99], v84 offset:221
; ALIGNED-NEXT: s_waitcnt vmcnt(13)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82
-; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:232
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83
-; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:230
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83
-; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:236
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v82
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v82
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v83
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v83
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:232
; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81
-; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:234
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:230
; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81
-; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:228
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:236
; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80
-; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:226
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:234
; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80
-; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:224
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:228
; ALIGNED-NEXT: s_waitcnt vmcnt(12)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70
-; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:222
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v70
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:226
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v70
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:224
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v71
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:222
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v71
; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136
; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140
; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132
; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v82 offset:212
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v83 offset:216
-; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:217
-; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:213
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v81 offset:208
-; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:209
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v80 offset:204
-; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:205
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69
-; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:216
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68
-; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:214
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68
-; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:220
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:212
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:216
+; ALIGNED-NEXT: flat_store_byte v[98:99], v83 offset:217
+; ALIGNED-NEXT: flat_store_byte v[98:99], v82 offset:213
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:208
+; ALIGNED-NEXT: flat_store_byte v[98:99], v81 offset:209
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:204
+; ALIGNED-NEXT: flat_store_byte v[98:99], v80 offset:205
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v69
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v69
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v68
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v68
+; ALIGNED-NEXT: flat_store_byte v[98:99], v84 offset:216
; ALIGNED-NEXT: s_waitcnt vmcnt(11)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66
-; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:218
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66
-; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:212
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67
-; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:210
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67
-; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:208
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65
-; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:206
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v66
+; ALIGNED-NEXT: flat_store_byte v[98:99], v85 offset:214
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v66
+; ALIGNED-NEXT: flat_store_byte v[98:99], v86 offset:220
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v67
+; ALIGNED-NEXT: flat_store_byte v[98:99], v87 offset:218
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v67
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:212
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v65
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:210
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v65
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:208
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v64
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:206
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v64
; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152
; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156
; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148
; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v70 offset:196
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v71 offset:200
-; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:201
-; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:197
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v69 offset:192
-; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:193
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v68 offset:188
-; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:189
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:196
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:200
+; ALIGNED-NEXT: flat_store_byte v[98:99], v71 offset:201
+; ALIGNED-NEXT: flat_store_byte v[98:99], v70 offset:197
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:192
+; ALIGNED-NEXT: flat_store_byte v[98:99], v69 offset:193
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:188
+; ALIGNED-NEXT: flat_store_byte v[98:99], v68 offset:189
; ALIGNED-NEXT: s_waitcnt vmcnt(10)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54
-; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:200
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55
-; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:198
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55
-; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:204
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52
-; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:202
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52
-; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:196
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53
-; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:194
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53
-; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:192
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v54
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v54
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v55
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v55
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:200
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v52
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:198
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v52
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:204
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:202
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53
+; ALIGNED-NEXT: flat_store_byte v[98:99], v80 offset:196
; ALIGNED-NEXT: s_waitcnt vmcnt(9)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50
-; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:190
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v50
+; ALIGNED-NEXT: flat_store_byte v[98:99], v81 offset:194
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v50
+; ALIGNED-NEXT: flat_store_byte v[98:99], v82 offset:192
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v51
+; ALIGNED-NEXT: flat_store_byte v[98:99], v83 offset:190
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v51
; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232
; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236
; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228
; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v66 offset:180
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v67 offset:184
-; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:185
-; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:181
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v65 offset:176
-; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:177
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v64 offset:172
-; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:173
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49
-; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:184
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48
-; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:182
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48
-; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:188
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:180
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:184
+; ALIGNED-NEXT: flat_store_byte v[98:99], v67 offset:185
+; ALIGNED-NEXT: flat_store_byte v[98:99], v66 offset:181
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:176
+; ALIGNED-NEXT: flat_store_byte v[98:99], v65 offset:177
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:172
+; ALIGNED-NEXT: flat_store_byte v[98:99], v64 offset:173
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v49
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v49
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v48
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v48
+; ALIGNED-NEXT: flat_store_byte v[98:99], v84 offset:184
; ALIGNED-NEXT: s_waitcnt vmcnt(8)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38
-; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:186
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38
-; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:180
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39
-; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:178
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39
-; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:176
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37
-; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:174
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v38
+; ALIGNED-NEXT: flat_store_byte v[98:99], v85 offset:182
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v38
+; ALIGNED-NEXT: flat_store_byte v[98:99], v86 offset:188
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v39
+; ALIGNED-NEXT: flat_store_byte v[98:99], v87 offset:186
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v39
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:180
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v37
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:178
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v37
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:176
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v36
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:174
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v36
; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252
; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248
; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244
; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v54 offset:164
-; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:165
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v55 offset:168
-; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:169
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v52 offset:156
-; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:157
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v53 offset:160
-; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:161
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:164
+; ALIGNED-NEXT: flat_store_byte v[98:99], v54 offset:165
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:168
+; ALIGNED-NEXT: flat_store_byte v[98:99], v55 offset:169
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:156
+; ALIGNED-NEXT: flat_store_byte v[98:99], v52 offset:157
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:160
+; ALIGNED-NEXT: flat_store_byte v[98:99], v53 offset:161
; ALIGNED-NEXT: s_waitcnt vmcnt(7)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34
-; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:168
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35
-; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:166
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35
-; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:172
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33
-; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:170
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33
-; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:160
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32
-; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:158
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32
-; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:164
-; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:162
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v34
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v34
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v35
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v35
+; ALIGNED-NEXT: flat_store_byte v[98:99], v68 offset:168
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v33
+; ALIGNED-NEXT: flat_store_byte v[98:99], v69 offset:166
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v33
+; ALIGNED-NEXT: flat_store_byte v[98:99], v70 offset:172
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v32
+; ALIGNED-NEXT: flat_store_byte v[98:99], v71 offset:170
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v32
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:160
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:158
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:164
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:162
; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200
; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204
; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196
; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192
-; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:152
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:148
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:152
+; ALIGNED-NEXT: flat_store_byte v[98:99], v51 offset:153
+; ALIGNED-NEXT: flat_store_byte v[98:99], v50 offset:149
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:144
+; ALIGNED-NEXT: flat_store_byte v[98:99], v49 offset:145
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:140
+; ALIGNED-NEXT: flat_store_byte v[98:99], v48 offset:141
; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28
-; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:150
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v50 offset:148
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v51 offset:152
-; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:153
-; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:149
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v49 offset:144
-; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:145
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v48 offset:140
-; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:141
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29
-; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:156
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v29
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v29
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v28
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v31
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v31
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v28
+; ALIGNED-NEXT: flat_store_byte v[98:99], v80 offset:152
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26
-; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:154
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26
-; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:148
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27
-; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:146
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27
-; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:144
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25
-; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:142
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v26
+; ALIGNED-NEXT: flat_store_byte v[98:99], v81 offset:150
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v26
+; ALIGNED-NEXT: flat_store_byte v[98:99], v82 offset:156
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v27
+; ALIGNED-NEXT: flat_store_byte v[98:99], v83 offset:154
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v27
+; ALIGNED-NEXT: flat_store_byte v[98:99], v64 offset:148
+; ALIGNED-NEXT: flat_store_byte v[98:99], v65 offset:146
+; ALIGNED-NEXT: flat_store_byte v[98:99], v66 offset:144
+; ALIGNED-NEXT: flat_store_byte v[98:99], v67 offset:142
; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216
; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220
; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212
; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v38 offset:132
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v39 offset:136
-; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:137
-; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:133
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v37 offset:128
-; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:129
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v36 offset:124
-; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:132
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:136
+; ALIGNED-NEXT: flat_store_byte v[98:99], v39 offset:137
+; ALIGNED-NEXT: flat_store_byte v[98:99], v38 offset:133
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:128
+; ALIGNED-NEXT: flat_store_byte v[98:99], v37 offset:129
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:124
+; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:128
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22
-; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:136
-; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:134
-; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:140
-; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:138
-; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:132
-; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:130
-; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:128
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v22
+; ALIGNED-NEXT: flat_store_byte v[98:99], v84 offset:136
+; ALIGNED-NEXT: flat_store_byte v[98:99], v85 offset:134
+; ALIGNED-NEXT: flat_store_byte v[98:99], v86 offset:140
+; ALIGNED-NEXT: flat_store_byte v[98:99], v87 offset:138
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:132
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18
-; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:126
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v18
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:130
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:128
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:126
; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40
; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44
; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36
; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v34 offset:116
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v35 offset:120
-; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:121
-; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:117
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v33 offset:112
-; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:113
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v32 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:109
-; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:120
-; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:118
-; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:124
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:116
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:120
+; ALIGNED-NEXT: flat_store_byte v[98:99], v35 offset:121
+; ALIGNED-NEXT: flat_store_byte v[98:99], v34 offset:117
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:112
+; ALIGNED-NEXT: flat_store_byte v[98:99], v33 offset:113
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:108
+; ALIGNED-NEXT: flat_store_byte v[98:99], v32 offset:109
+; ALIGNED-NEXT: flat_store_byte v[98:99], v52 offset:120
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14
-; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:122
-; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:116
-; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:114
-; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:112
-; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:110
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v14
+; ALIGNED-NEXT: flat_store_byte v[98:99], v53 offset:118
+; ALIGNED-NEXT: flat_store_byte v[98:99], v54 offset:124
+; ALIGNED-NEXT: flat_store_byte v[98:99], v55 offset:122
+; ALIGNED-NEXT: flat_store_byte v[98:99], v68 offset:116
+; ALIGNED-NEXT: flat_store_byte v[98:99], v69 offset:114
+; ALIGNED-NEXT: flat_store_byte v[98:99], v70 offset:112
+; ALIGNED-NEXT: flat_store_byte v[98:99], v71 offset:110
; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56
; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60
; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52
; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v30 offset:100
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v31 offset:104
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:105
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:101
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v29 offset:96
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:97
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v28 offset:92
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:93
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10
-; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:96
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10
-; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:94
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6
-; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15
-; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:104
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11
-; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:102
-; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:108
-; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:106
-; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:100
-; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:98
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:100
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:104
+; ALIGNED-NEXT: flat_store_byte v[98:99], v31 offset:105
+; ALIGNED-NEXT: flat_store_byte v[98:99], v30 offset:101
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:96
+; ALIGNED-NEXT: flat_store_byte v[98:99], v29 offset:97
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:92
+; ALIGNED-NEXT: flat_store_byte v[98:99], v28 offset:93
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v10
+; ALIGNED-NEXT: flat_store_byte v[98:99], v48 offset:100
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v22
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v18
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v14
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v10
+; ALIGNED-NEXT: flat_store_byte v[98:99], v49 offset:98
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v6
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v25
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v23
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v19
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v15
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v11
+; ALIGNED-NEXT: flat_store_byte v[98:99], v50 offset:96
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v7
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v25
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v24
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v24
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v23
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v21
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v21
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v20
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v20
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v19
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v17
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v16
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v16
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v15
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v13
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v13
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v12
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v12
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v11
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:104
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v9
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:102
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v9
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:108
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v8
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:106
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v8
+; ALIGNED-NEXT: flat_store_byte v[98:99], v51 offset:94
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v7
; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8
; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12
; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4
; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32
-; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:88
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v7
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8
-; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:86
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v7
-; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:92
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v5
-; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:90
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v5
-; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:84
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v4
-; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:82
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v4
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v26 offset:84
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v27 offset:88
-; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:89
-; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:85
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v25 offset:80
-; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:81
-; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:80
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v24 offset:76
-; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:78
-; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:77
+; ALIGNED-NEXT: flat_store_byte v[98:99], v80 offset:88
+; ALIGNED-NEXT: flat_store_byte v[98:99], v81 offset:86
+; ALIGNED-NEXT: flat_store_byte v[98:99], v82 offset:92
+; ALIGNED-NEXT: flat_store_byte v[98:99], v83 offset:90
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v4
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:84
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:88
+; ALIGNED-NEXT: flat_store_byte v[98:99], v27 offset:89
+; ALIGNED-NEXT: flat_store_byte v[98:99], v26 offset:85
+; ALIGNED-NEXT: flat_store_byte v[98:99], v64 offset:84
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:80
+; ALIGNED-NEXT: flat_store_byte v[98:99], v65 offset:82
+; ALIGNED-NEXT: flat_store_byte v[98:99], v25 offset:81
+; ALIGNED-NEXT: flat_store_byte v[98:99], v66 offset:80
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:76
+; ALIGNED-NEXT: flat_store_byte v[98:99], v67 offset:78
+; ALIGNED-NEXT: flat_store_byte v[98:99], v24 offset:77
; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24
; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28
; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20
; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v22 offset:68
-; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:72
-; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:70
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v23 offset:72
-; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:76
-; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:73
-; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:74
-; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:69
-; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:68
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v21 offset:64
-; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:66
-; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:65
-; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:64
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v20 offset:60
-; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:62
-; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:68
+; ALIGNED-NEXT: flat_store_byte v[98:99], v36 offset:72
+; ALIGNED-NEXT: flat_store_byte v[98:99], v37 offset:70
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:72
+; ALIGNED-NEXT: flat_store_byte v[98:99], v38 offset:76
+; ALIGNED-NEXT: flat_store_byte v[98:99], v23 offset:73
+; ALIGNED-NEXT: flat_store_byte v[98:99], v39 offset:74
+; ALIGNED-NEXT: flat_store_byte v[98:99], v22 offset:69
+; ALIGNED-NEXT: flat_store_byte v[98:99], v84 offset:68
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:64
+; ALIGNED-NEXT: flat_store_byte v[98:99], v85 offset:66
+; ALIGNED-NEXT: flat_store_byte v[98:99], v21 offset:65
+; ALIGNED-NEXT: flat_store_byte v[98:99], v86 offset:64
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:60
+; ALIGNED-NEXT: flat_store_byte v[98:99], v87 offset:62
+; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:64
; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104
; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108
; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100
; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v18 offset:52
-; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:56
-; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:54
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v19 offset:56
-; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:60
-; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:57
-; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:58
-; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:53
-; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:52
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v17 offset:48
-; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:50
-; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:49
-; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:48
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v16 offset:44
-; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:46
-; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:45
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:52
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:56
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:54
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:56
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:60
+; ALIGNED-NEXT: flat_store_byte v[98:99], v19 offset:57
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:58
+; ALIGNED-NEXT: flat_store_byte v[98:99], v18 offset:53
+; ALIGNED-NEXT: flat_store_byte v[98:99], v32 offset:52
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:48
+; ALIGNED-NEXT: flat_store_byte v[98:99], v33 offset:50
+; ALIGNED-NEXT: flat_store_byte v[98:99], v17 offset:49
+; ALIGNED-NEXT: flat_store_byte v[98:99], v34 offset:48
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:44
+; ALIGNED-NEXT: flat_store_byte v[98:99], v35 offset:46
+; ALIGNED-NEXT: flat_store_byte v[98:99], v16 offset:45
; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120
; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124
; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116
; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v14 offset:36
-; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:40
-; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:38
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v15 offset:40
-; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:44
-; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:41
-; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:42
-; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:37
-; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:36
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v13 offset:32
-; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:34
-; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:33
-; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:32
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v12 offset:28
-; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:30
-; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:36
+; ALIGNED-NEXT: flat_store_byte v[98:99], v52 offset:40
+; ALIGNED-NEXT: flat_store_byte v[98:99], v53 offset:38
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:40
+; ALIGNED-NEXT: flat_store_byte v[98:99], v54 offset:44
+; ALIGNED-NEXT: flat_store_byte v[98:99], v15 offset:41
+; ALIGNED-NEXT: flat_store_byte v[98:99], v55 offset:42
+; ALIGNED-NEXT: flat_store_byte v[98:99], v14 offset:37
+; ALIGNED-NEXT: flat_store_byte v[98:99], v68 offset:36
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:32
+; ALIGNED-NEXT: flat_store_byte v[98:99], v69 offset:34
+; ALIGNED-NEXT: flat_store_byte v[98:99], v13 offset:33
+; ALIGNED-NEXT: flat_store_byte v[98:99], v70 offset:32
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:28
+; ALIGNED-NEXT: flat_store_byte v[98:99], v71 offset:30
+; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:32
; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72
; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76
; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68
; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v10 offset:20
-; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:24
-; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:22
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v11 offset:24
-; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:28
-; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:25
-; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:26
-; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:21
-; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:20
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v9 offset:16
-; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:18
-; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:17
-; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:16
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v8 offset:12
-; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:14
-; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:20
+; ALIGNED-NEXT: flat_store_byte v[98:99], v28 offset:24
+; ALIGNED-NEXT: flat_store_byte v[98:99], v29 offset:22
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:24
+; ALIGNED-NEXT: flat_store_byte v[98:99], v30 offset:28
+; ALIGNED-NEXT: flat_store_byte v[98:99], v11 offset:25
+; ALIGNED-NEXT: flat_store_byte v[98:99], v31 offset:26
+; ALIGNED-NEXT: flat_store_byte v[98:99], v10 offset:21
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:20
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:16
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:18
+; ALIGNED-NEXT: flat_store_byte v[98:99], v9 offset:17
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:16
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:12
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:14
+; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:16
; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88
; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92
; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84
; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v6 offset:4
-; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:8
-; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:6
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v7 offset:8
-; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:12
-; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:9
-; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:10
-; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8
-; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:4
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v5
-; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:2
-; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4
-; ALIGNED-NEXT: flat_store_byte v[96:97], v98
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2
-; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:1
-; ALIGNED-NEXT: flat_store_byte v[84:85], v4
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:4
+; ALIGNED-NEXT: flat_store_byte v[98:99], v48 offset:8
+; ALIGNED-NEXT: flat_store_byte v[98:99], v49 offset:6
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:8
+; ALIGNED-NEXT: flat_store_byte v[98:99], v50 offset:12
+; ALIGNED-NEXT: flat_store_byte v[98:99], v7 offset:9
+; ALIGNED-NEXT: flat_store_byte v[98:99], v51 offset:10
+; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:8
+; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:4
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:2
+; ALIGNED-NEXT: flat_store_byte v[0:1], v83 offset:1
+; ALIGNED-NEXT: flat_store_byte v[0:1], v4
+; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v5
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v5
+; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v4
+; ALIGNED-NEXT: flat_store_byte v[98:99], v80 offset:4
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5
+; ALIGNED-NEXT: flat_store_byte v[98:99], v81 offset:2
+; ALIGNED-NEXT: flat_store_byte v[98:99], v82
; ALIGNED-NEXT: s_cbranch_scc1 .LBB7_2
-; ALIGNED-NEXT: .LBB7_3: ; %Flow6
-; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6
+; ALIGNED-NEXT: .LBB7_3: ; %Flow16
+; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
; ALIGNED-NEXT: s_cbranch_execz .LBB7_6
; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; ALIGNED-NEXT: s_movk_i32 s6, 0xff00
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700
-; ALIGNED-NEXT: s_mov_b32 s7, -1
+; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
+; ALIGNED-NEXT: s_mov_b32 s5, -1
; ALIGNED-NEXT: .LBB7_5: ; %memmove_bwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo
; ALIGNED-NEXT: s_clause 0xf
-; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240
-; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:224
-; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208
-; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192
-; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176
-; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160
-; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144
-; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128
-; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112
-; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96
-; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80
-; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64
-; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48
-; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32
-; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16
-; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off
-; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 6
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo
-; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 3
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo
-; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
-; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
+; ALIGNED-NEXT: global_load_dwordx4 v[100:103], v[2:3], off offset:2032
+; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[2:3], off offset:2016
+; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[2:3], off offset:2000
+; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[2:3], off offset:1984
+; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[2:3], off offset:1968
+; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[2:3], off offset:1952
+; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[2:3], off offset:1936
+; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[2:3], off offset:1920
+; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[2:3], off offset:1904
+; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:1888
+; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:1872
+; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:1856
+; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[2:3], off offset:1840
+; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:1824
+; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:1792
+; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:1808
+; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, 6
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v98, vcc_lo, v0, 3
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v99, null, 0, v1, vcc_lo
+; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff00, v2
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: s_waitcnt vmcnt(15)
-; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:424
-; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:428
-; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:420
-; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:416
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:244
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:248
-; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:249
-; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:245
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:240
-; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:241
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:236
-; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:237
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100
+; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v102
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v102
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v103
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v102 offset:244
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v103 offset:248
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:249
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:245
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:240
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:241
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:236
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:237
; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v100
; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98
-; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:248
-; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:246
-; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:252
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:248
; ALIGNED-NEXT: s_waitcnt vmcnt(14)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114
-; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:250
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113
-; ALIGNED-NEXT: flat_store_byte v[86:87], v116 offset:244
-; ALIGNED-NEXT: flat_store_byte v[86:87], v117 offset:242
-; ALIGNED-NEXT: flat_store_byte v[86:87], v118 offset:240
-; ALIGNED-NEXT: flat_store_byte v[86:87], v119 offset:238
-; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:440
-; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:444
-; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:436
-; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:432
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:228
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:232
-; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:233
-; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:229
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:224
-; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:225
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:220
-; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:221
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v86
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:246
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v86
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:252
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:250
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:244
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:242
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:240
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:238
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84
+; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:228
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:232
+; ALIGNED-NEXT: flat_store_byte v[98:99], v87 offset:233
+; ALIGNED-NEXT: flat_store_byte v[98:99], v86 offset:229
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:224
+; ALIGNED-NEXT: flat_store_byte v[98:99], v85 offset:225
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:220
+; ALIGNED-NEXT: flat_store_byte v[98:99], v84 offset:221
; ALIGNED-NEXT: s_waitcnt vmcnt(13)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82
-; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:232
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83
-; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:230
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83
-; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:236
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81
-; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:234
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81
-; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:228
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80
-; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:226
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80
-; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:224
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v82
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v82
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v83
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v83
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:232
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v81
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:230
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v81
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:236
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v80
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:234
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v80
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:228
; ALIGNED-NEXT: s_waitcnt vmcnt(12)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70
-; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:222
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70
-; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:392
-; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:396
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:388
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v70
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:226
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v70
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:224
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v71
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:222
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v71
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:256
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:212
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:216
-; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:217
-; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:213
+; ALIGNED-NEXT: flat_store_byte v[98:99], v83 offset:217
+; ALIGNED-NEXT: flat_store_byte v[98:99], v82 offset:213
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:208
-; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:209
+; ALIGNED-NEXT: flat_store_byte v[98:99], v81 offset:209
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:204
-; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:205
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69
-; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:216
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68
-; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:214
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68
-; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:220
+; ALIGNED-NEXT: flat_store_byte v[98:99], v80 offset:205
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v69
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v69
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v68
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v68
+; ALIGNED-NEXT: flat_store_byte v[98:99], v84 offset:216
; ALIGNED-NEXT: s_waitcnt vmcnt(11)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66
-; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:218
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66
-; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:212
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67
-; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:210
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67
-; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:208
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65
-; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:206
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65
-; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v66
+; ALIGNED-NEXT: flat_store_byte v[98:99], v85 offset:214
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v66
+; ALIGNED-NEXT: flat_store_byte v[98:99], v86 offset:220
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v67
+; ALIGNED-NEXT: flat_store_byte v[98:99], v87 offset:218
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v67
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:212
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v65
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:210
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v65
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:208
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v64
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:206
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v64
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:272
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:196
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:200
-; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:201
-; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:197
+; ALIGNED-NEXT: flat_store_byte v[98:99], v71 offset:201
+; ALIGNED-NEXT: flat_store_byte v[98:99], v70 offset:197
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:192
-; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:193
+; ALIGNED-NEXT: flat_store_byte v[98:99], v69 offset:193
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:188
-; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:189
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64
+; ALIGNED-NEXT: flat_store_byte v[98:99], v68 offset:189
; ALIGNED-NEXT: s_waitcnt vmcnt(10)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54
-; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:200
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55
-; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:198
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55
-; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:204
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52
-; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:202
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52
-; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:196
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53
-; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:194
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53
-; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:192
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v54
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v54
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v55
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v55
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:200
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v53
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:198
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v53
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:204
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v52
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:202
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v52
+; ALIGNED-NEXT: flat_store_byte v[98:99], v80 offset:196
; ALIGNED-NEXT: s_waitcnt vmcnt(9)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50
-; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:190
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:488
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:492
-; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:484
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v50
+; ALIGNED-NEXT: flat_store_byte v[98:99], v81 offset:194
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v50
+; ALIGNED-NEXT: flat_store_byte v[98:99], v82 offset:192
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v51
+; ALIGNED-NEXT: flat_store_byte v[98:99], v83 offset:190
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v51
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:352
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:180
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:184
-; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:185
-; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:181
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:176
-; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:177
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:172
-; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:173
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49
-; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:184
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48
-; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:182
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48
-; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:188
-; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:186
-; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:180
-; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:178
-; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:176
-; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:174
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:508
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:504
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:496
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:164
-; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:165
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:168
-; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:169
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:156
-; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:157
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:160
-; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:161
-; ALIGNED-NEXT: s_waitcnt vmcnt(8)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:184
+; ALIGNED-NEXT: flat_store_byte v[98:99], v67 offset:185
+; ALIGNED-NEXT: flat_store_byte v[98:99], v66 offset:181
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:176
+; ALIGNED-NEXT: flat_store_byte v[98:99], v65 offset:177
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:172
+; ALIGNED-NEXT: flat_store_byte v[98:99], v64 offset:173
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v49
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v49
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v48
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v48
+; ALIGNED-NEXT: flat_store_byte v[98:99], v84 offset:184
+; ALIGNED-NEXT: s_waitcnt vmcnt(8)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v38
+; ALIGNED-NEXT: flat_store_byte v[98:99], v85 offset:182
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v38
+; ALIGNED-NEXT: flat_store_byte v[98:99], v86 offset:188
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v39
+; ALIGNED-NEXT: flat_store_byte v[98:99], v87 offset:186
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v39
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:180
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:178
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v37
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:176
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v36
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:174
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v36
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:376
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:380
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:372
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:164
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:168
+; ALIGNED-NEXT: flat_store_byte v[98:99], v55 offset:169
+; ALIGNED-NEXT: flat_store_byte v[98:99], v54 offset:165
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:160
+; ALIGNED-NEXT: flat_store_byte v[98:99], v53 offset:161
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:156
+; ALIGNED-NEXT: flat_store_byte v[98:99], v52 offset:157
; ALIGNED-NEXT: s_waitcnt vmcnt(7)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34
-; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:168
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35
-; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:166
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35
-; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:172
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33
-; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:170
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33
-; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:160
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32
-; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:158
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32
-; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:164
-; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:162
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:448
-; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:152
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28
-; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:150
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v34
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v34
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v35
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v35
+; ALIGNED-NEXT: flat_store_byte v[98:99], v68 offset:168
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v33
+; ALIGNED-NEXT: flat_store_byte v[98:99], v69 offset:166
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v33
+; ALIGNED-NEXT: flat_store_byte v[98:99], v70 offset:172
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v32
+; ALIGNED-NEXT: flat_store_byte v[98:99], v71 offset:170
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v32
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:164
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:162
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:160
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:158
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:320
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:148
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:152
-; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:153
-; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:149
+; ALIGNED-NEXT: flat_store_byte v[98:99], v51 offset:153
+; ALIGNED-NEXT: flat_store_byte v[98:99], v50 offset:149
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:144
-; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:145
+; ALIGNED-NEXT: flat_store_byte v[98:99], v49 offset:145
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:140
-; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:141
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29
-; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:156
+; ALIGNED-NEXT: flat_store_byte v[98:99], v48 offset:141
+; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v29
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v29
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v28
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v30
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v31
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v31
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v28
+; ALIGNED-NEXT: flat_store_byte v[98:99], v80 offset:152
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26
-; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:154
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26
-; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:148
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27
-; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:146
-; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:144
-; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:142
-; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:472
-; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476
-; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468
-; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464
-; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:128
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v26
+; ALIGNED-NEXT: flat_store_byte v[98:99], v81 offset:150
+; ALIGNED-NEXT: flat_store_byte v[98:99], v82 offset:156
+; ALIGNED-NEXT: flat_store_byte v[98:99], v83 offset:154
+; ALIGNED-NEXT: flat_store_byte v[98:99], v64 offset:148
+; ALIGNED-NEXT: flat_store_byte v[98:99], v65 offset:146
+; ALIGNED-NEXT: flat_store_byte v[98:99], v66 offset:144
+; ALIGNED-NEXT: flat_store_byte v[98:99], v67 offset:142
+; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:336
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:132
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:136
-; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:137
-; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:133
+; ALIGNED-NEXT: flat_store_byte v[98:99], v39 offset:137
+; ALIGNED-NEXT: flat_store_byte v[98:99], v38 offset:133
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:128
-; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:129
+; ALIGNED-NEXT: flat_store_byte v[98:99], v37 offset:129
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:124
-; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128
-; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:136
-; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:134
-; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:140
-; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:138
-; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:132
-; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:130
-; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:126
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296
-; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT: flat_store_byte v[0:1], v36 offset:128
+; ALIGNED-NEXT: flat_store_byte v[98:99], v84 offset:136
+; ALIGNED-NEXT: flat_store_byte v[98:99], v85 offset:134
+; ALIGNED-NEXT: flat_store_byte v[98:99], v86 offset:140
+; ALIGNED-NEXT: flat_store_byte v[98:99], v87 offset:138
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:132
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v18
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:130
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:128
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:126
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT: flat_store_byte v[98:99], v52 offset:120
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v14
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v26
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v18
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:116
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:120
-; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:121
-; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:117
+; ALIGNED-NEXT: flat_store_byte v[98:99], v35 offset:121
+; ALIGNED-NEXT: flat_store_byte v[98:99], v34 offset:117
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:112
-; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:113
+; ALIGNED-NEXT: flat_store_byte v[98:99], v33 offset:113
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:108
-; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:109
-; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:120
-; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:118
-; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:124
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14
-; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:122
-; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:116
-; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:114
-; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:112
-; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:110
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312
-; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT: flat_store_byte v[98:99], v32 offset:109
+; ALIGNED-NEXT: flat_store_byte v[98:99], v53 offset:118
+; ALIGNED-NEXT: flat_store_byte v[98:99], v54 offset:124
+; ALIGNED-NEXT: flat_store_byte v[98:99], v55 offset:122
+; ALIGNED-NEXT: flat_store_byte v[98:99], v68 offset:116
+; ALIGNED-NEXT: flat_store_byte v[98:99], v69 offset:114
+; ALIGNED-NEXT: flat_store_byte v[98:99], v70 offset:112
+; ALIGNED-NEXT: flat_store_byte v[98:99], v71 offset:110
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:100
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:104
-; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:105
-; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:101
+; ALIGNED-NEXT: flat_store_byte v[98:99], v31 offset:105
+; ALIGNED-NEXT: flat_store_byte v[98:99], v30 offset:101
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:96
-; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:97
+; ALIGNED-NEXT: flat_store_byte v[98:99], v29 offset:97
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:92
-; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:93
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10
-; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:96
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10
-; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:94
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6
-; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15
-; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:104
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11
-; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:102
-; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:108
-; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:106
-; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:100
-; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:98
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:268
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:260
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT: flat_store_byte v[98:99], v28 offset:93
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v10
+; ALIGNED-NEXT: flat_store_byte v[98:99], v48 offset:100
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v27
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v14
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v10
+; ALIGNED-NEXT: flat_store_byte v[98:99], v49 offset:98
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v6
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v27
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v25
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v15
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v11
+; ALIGNED-NEXT: flat_store_byte v[98:99], v50 offset:96
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v7
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v25
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v24
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v24
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v22
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v22
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v23
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v23
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v21
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v21
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v20
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v20
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v19
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v17
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v16
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v16
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v15
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v12
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v12
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v13
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v13
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v11
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:104
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v9
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:102
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v9
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:108
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v8
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:106
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v8
+; ALIGNED-NEXT: flat_store_byte v[98:99], v51 offset:94
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v7
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:384
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:84
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:88
-; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:89
-; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:85
+; ALIGNED-NEXT: flat_store_byte v[98:99], v27 offset:89
+; ALIGNED-NEXT: flat_store_byte v[98:99], v26 offset:85
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:80
-; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:81
+; ALIGNED-NEXT: flat_store_byte v[98:99], v25 offset:81
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:76
-; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:77
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v7
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v7
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v5
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v5
-; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:88
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v4
-; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:86
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v4
-; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:92
-; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:90
-; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:84
-; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:82
-; ALIGNED-NEXT: flat_store_byte v[86:87], v36 offset:80
-; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:78
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT: flat_store_byte v[98:99], v24 offset:77
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v4
+; ALIGNED-NEXT: flat_store_byte v[98:99], v80 offset:88
+; ALIGNED-NEXT: flat_store_byte v[98:99], v81 offset:86
+; ALIGNED-NEXT: flat_store_byte v[98:99], v82 offset:92
+; ALIGNED-NEXT: flat_store_byte v[98:99], v83 offset:90
+; ALIGNED-NEXT: flat_store_byte v[98:99], v64 offset:84
+; ALIGNED-NEXT: flat_store_byte v[98:99], v65 offset:82
+; ALIGNED-NEXT: flat_store_byte v[98:99], v66 offset:80
+; ALIGNED-NEXT: flat_store_byte v[98:99], v67 offset:78
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:412
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:400
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:68
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:72
-; ALIGNED-NEXT: flat_store_byte v[86:87], v23 offset:73
-; ALIGNED-NEXT: flat_store_byte v[86:87], v22 offset:69
+; ALIGNED-NEXT: flat_store_byte v[98:99], v23 offset:73
+; ALIGNED-NEXT: flat_store_byte v[98:99], v22 offset:69
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:64
-; ALIGNED-NEXT: flat_store_byte v[86:87], v21 offset:65
+; ALIGNED-NEXT: flat_store_byte v[98:99], v21 offset:65
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:60
-; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64
-; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:72
-; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:70
-; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:76
-; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:74
-; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:68
-; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:66
-; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:64
-; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:62
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:364
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352
-; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:56
-; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:54
-; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:60
-; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:58
-; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:52
+; ALIGNED-NEXT: flat_store_byte v[0:1], v20 offset:64
+; ALIGNED-NEXT: flat_store_byte v[98:99], v36 offset:72
+; ALIGNED-NEXT: flat_store_byte v[98:99], v37 offset:70
+; ALIGNED-NEXT: flat_store_byte v[98:99], v38 offset:76
+; ALIGNED-NEXT: flat_store_byte v[98:99], v39 offset:74
+; ALIGNED-NEXT: flat_store_byte v[98:99], v84 offset:68
+; ALIGNED-NEXT: flat_store_byte v[98:99], v85 offset:66
+; ALIGNED-NEXT: flat_store_byte v[98:99], v86 offset:64
+; ALIGNED-NEXT: flat_store_byte v[98:99], v87 offset:62
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT: flat_store_byte v[98:99], v115 offset:56
+; ALIGNED-NEXT: flat_store_byte v[98:99], v114 offset:54
+; ALIGNED-NEXT: flat_store_byte v[98:99], v113 offset:60
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:52
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:56
-; ALIGNED-NEXT: flat_store_byte v[86:87], v19 offset:57
-; ALIGNED-NEXT: flat_store_byte v[86:87], v18 offset:53
+; ALIGNED-NEXT: flat_store_byte v[98:99], v19 offset:57
+; ALIGNED-NEXT: flat_store_byte v[98:99], v112 offset:58
+; ALIGNED-NEXT: flat_store_byte v[98:99], v18 offset:53
+; ALIGNED-NEXT: flat_store_byte v[98:99], v32 offset:52
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:48
-; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:50
-; ALIGNED-NEXT: flat_store_byte v[86:87], v17 offset:49
-; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:48
+; ALIGNED-NEXT: flat_store_byte v[98:99], v33 offset:50
+; ALIGNED-NEXT: flat_store_byte v[98:99], v17 offset:49
+; ALIGNED-NEXT: flat_store_byte v[98:99], v34 offset:48
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:44
-; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:46
-; ALIGNED-NEXT: flat_store_byte v[86:87], v16 offset:45
-; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376
-; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380
-; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT: flat_store_byte v[98:99], v35 offset:46
+; ALIGNED-NEXT: flat_store_byte v[98:99], v16 offset:45
+; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:496
+; ALIGNED-NEXT: flat_store_byte v[98:99], v52 offset:40
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:36
-; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:40
-; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:38
+; ALIGNED-NEXT: flat_store_byte v[98:99], v53 offset:38
+; ALIGNED-NEXT: flat_store_byte v[98:99], v14 offset:37
+; ALIGNED-NEXT: flat_store_byte v[98:99], v54 offset:44
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:40
-; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:44
-; ALIGNED-NEXT: flat_store_byte v[86:87], v15 offset:41
-; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:42
-; ALIGNED-NEXT: flat_store_byte v[86:87], v14 offset:37
-; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:36
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:32
-; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:34
-; ALIGNED-NEXT: flat_store_byte v[86:87], v13 offset:33
-; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:32
+; ALIGNED-NEXT: flat_store_byte v[98:99], v55 offset:42
+; ALIGNED-NEXT: flat_store_byte v[98:99], v15 offset:41
+; ALIGNED-NEXT: flat_store_byte v[98:99], v68 offset:32
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:28
-; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:30
-; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320
+; ALIGNED-NEXT: flat_store_byte v[98:99], v69 offset:30
+; ALIGNED-NEXT: flat_store_byte v[0:1], v12 offset:32
+; ALIGNED-NEXT: flat_store_byte v[98:99], v70 offset:36
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:32
+; ALIGNED-NEXT: flat_store_byte v[98:99], v71 offset:34
+; ALIGNED-NEXT: flat_store_byte v[98:99], v13 offset:33
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:448
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:20
-; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:24
-; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:22
+; ALIGNED-NEXT: flat_store_byte v[98:99], v28 offset:24
+; ALIGNED-NEXT: flat_store_byte v[98:99], v29 offset:22
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:24
-; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:28
-; ALIGNED-NEXT: flat_store_byte v[86:87], v11 offset:25
-; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:26
-; ALIGNED-NEXT: flat_store_byte v[86:87], v10 offset:21
-; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:20
+; ALIGNED-NEXT: flat_store_byte v[98:99], v30 offset:28
+; ALIGNED-NEXT: flat_store_byte v[98:99], v11 offset:25
+; ALIGNED-NEXT: flat_store_byte v[98:99], v31 offset:26
+; ALIGNED-NEXT: flat_store_byte v[98:99], v10 offset:21
+; ALIGNED-NEXT: flat_store_byte v[98:99], v103 offset:20
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:16
-; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:18
-; ALIGNED-NEXT: flat_store_byte v[86:87], v9 offset:17
-; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:16
+; ALIGNED-NEXT: flat_store_byte v[98:99], v102 offset:18
+; ALIGNED-NEXT: flat_store_byte v[98:99], v9 offset:17
+; ALIGNED-NEXT: flat_store_byte v[98:99], v101 offset:16
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:12
-; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:14
-; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT: flat_store_byte v[98:99], v100 offset:14
+; ALIGNED-NEXT: flat_store_byte v[0:1], v8 offset:16
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:4
-; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:8
-; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:6
+; ALIGNED-NEXT: flat_store_byte v[98:99], v48 offset:8
+; ALIGNED-NEXT: flat_store_byte v[98:99], v49 offset:6
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:8
-; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:12
-; ALIGNED-NEXT: flat_store_byte v[86:87], v7 offset:9
-; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:10
-; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8
-; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:4
+; ALIGNED-NEXT: flat_store_byte v[98:99], v50 offset:12
+; ALIGNED-NEXT: flat_store_byte v[98:99], v7 offset:9
+; ALIGNED-NEXT: flat_store_byte v[98:99], v51 offset:10
+; ALIGNED-NEXT: flat_store_byte v[0:1], v6 offset:8
+; ALIGNED-NEXT: flat_store_byte v[0:1], v5 offset:4
+; ALIGNED-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:2
+; ALIGNED-NEXT: flat_store_byte v[0:1], v27 offset:1
+; ALIGNED-NEXT: flat_store_byte v[0:1], v4
+; ALIGNED-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v5
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v5
+; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v4
+; ALIGNED-NEXT: flat_store_byte v[98:99], v24 offset:4
; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5
-; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:2
-; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4
-; ALIGNED-NEXT: flat_store_byte v[86:87], v64
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2
-; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:1
-; ALIGNED-NEXT: flat_store_byte v[84:85], v4
+; ALIGNED-NEXT: flat_store_byte v[98:99], v25 offset:2
+; ALIGNED-NEXT: flat_store_byte v[98:99], v26
; ALIGNED-NEXT: s_cbranch_scc0 .LBB7_5
-; ALIGNED-NEXT: .LBB7_6: ; %Flow7
-; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; ALIGNED-NEXT: .LBB7_6: ; %Flow17
+; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6
; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
@@ -9645,27 +9661,31 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4
; UNROLL3-NEXT: s_cbranch_execz .LBB7_4
; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
-; UNROLL3-NEXT: s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT: v_mov_b32_e32 v5, v3
+; UNROLL3-NEXT: v_mov_b32_e32 v7, v1
+; UNROLL3-NEXT: v_mov_b32_e32 v4, v2
+; UNROLL3-NEXT: v_mov_b32_e32 v6, v0
+; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0
; UNROLL3-NEXT: .p2align 6
; UNROLL3-NEXT: .LBB7_2: ; %memmove_fwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
-; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
-; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
; UNROLL3-NEXT: s_clause 0x2
-; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16
-; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off
-; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32
-; UNROLL3-NEXT: s_add_u32 s4, s4, 48
-; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16
+; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[4:5], off
+; UNROLL3-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:32
+; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, v4, 48
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0
+; UNROLL3-NEXT: s_addc_u32 s5, s5, -1
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] offset:16
+; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[8:11] offset:16
; UNROLL3-NEXT: s_waitcnt vmcnt(1)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[12:15]
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32
-; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0
+; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[16:19] offset:32
+; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v6, 48
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0
; UNROLL3-NEXT: s_cbranch_scc1 .LBB7_2
; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual
; UNROLL3-NEXT: s_clause 0x1
@@ -9676,44 +9696,45 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4
; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2016
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:2032
-; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; UNROLL3-NEXT: .LBB7_4: ; %Flow4
-; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6
+; UNROLL3-NEXT: ; implicit-def: $vgpr0
+; UNROLL3-NEXT: .LBB7_4: ; %Flow14
+; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6
; UNROLL3-NEXT: s_cbranch_execz .LBB7_7
; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual
; UNROLL3-NEXT: s_clause 0x1
-; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:2032
-; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:2016
-; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0
-; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0
-; UNROLL3-NEXT: s_mov_b32 s7, -1
+; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:2032
+; UNROLL3-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:2016
+; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0x7b0, v0
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo
+; UNROLL3-NEXT: s_movk_i32 s4, 0xf820
+; UNROLL3-NEXT: s_mov_b32 s5, -1
; UNROLL3-NEXT: s_waitcnt vmcnt(1)
-; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:2032
+; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:2032
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
-; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:2016
-; UNROLL3-NEXT: .p2align 6
-; UNROLL3-NEXT: .LBB7_6: ; %memmove_bwd_loop
-; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
-; UNROLL3-NEXT: v_add_co_u32 v12, vcc_lo, v2, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v13, null, s5, v3, vcc_lo
-; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
-; UNROLL3-NEXT: s_clause 0x2
-; UNROLL3-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16
-; UNROLL3-NEXT: global_load_dwordx4 v[8:11], v[12:13], off
-; UNROLL3-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:32
-; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0
-; UNROLL3-NEXT: s_addc_u32 s5, s5, -1
+; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:2016
+; UNROLL3-NEXT: .p2align 6
+; UNROLL3-NEXT: .LBB7_6: ; %memmove_bwd_loop
+; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT: s_clause 0x2
+; UNROLL3-NEXT: global_load_dwordx4 v[6:9], v[2:3], off offset:1984
+; UNROLL3-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:1968
+; UNROLL3-NEXT: global_load_dwordx4 v[14:17], v[2:3], off offset:2000
+; UNROLL3-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffffd0, v2
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
+; UNROLL3-NEXT: s_add_u32 s4, s4, 48
+; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7] offset:16
+; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[6:9] offset:16
; UNROLL3-NEXT: s_waitcnt vmcnt(1)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
+; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[10:13]
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32
-; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
+; UNROLL3-NEXT: flat_store_dwordx4 v[4:5], v[14:17] offset:32
+; UNROLL3-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffffd0, v4
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v5, vcc_lo
+; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0
; UNROLL3-NEXT: s_cbranch_scc0 .LBB7_6
-; UNROLL3-NEXT: .LBB7_7: ; %Flow5
-; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; UNROLL3-NEXT: .LBB7_7: ; %Flow15
+; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6
; UNROLL3-NEXT: s_waitcnt lgkmcnt(0)
; UNROLL3-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -9935,207 +9956,205 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
; CHECK-NEXT: s_cbranch_execz .LBB8_6
; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
-; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; CHECK-NEXT: s_movk_i32 s4, 0xf800
; CHECK-NEXT: s_mov_b32 s5, -1
; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_clause 0x3e
-; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:252
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:248
-; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:244
-; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:240
-; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:236
-; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:232
-; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:228
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:224
-; CHECK-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:220
-; CHECK-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:216
-; CHECK-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:212
-; CHECK-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:208
-; CHECK-NEXT: buffer_load_dword v14, v1, s[0:3], 0 offen offset:204
-; CHECK-NEXT: buffer_load_dword v15, v1, s[0:3], 0 offen offset:200
-; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen offset:196
-; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:192
-; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:188
-; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:184
-; CHECK-NEXT: buffer_load_dword v20, v1, s[0:3], 0 offen offset:180
-; CHECK-NEXT: buffer_load_dword v21, v1, s[0:3], 0 offen offset:176
-; CHECK-NEXT: buffer_load_dword v22, v1, s[0:3], 0 offen offset:172
-; CHECK-NEXT: buffer_load_dword v23, v1, s[0:3], 0 offen offset:168
-; CHECK-NEXT: buffer_load_dword v24, v1, s[0:3], 0 offen offset:164
-; CHECK-NEXT: buffer_load_dword v25, v1, s[0:3], 0 offen offset:160
-; CHECK-NEXT: buffer_load_dword v26, v1, s[0:3], 0 offen offset:156
-; CHECK-NEXT: buffer_load_dword v27, v1, s[0:3], 0 offen offset:152
-; CHECK-NEXT: buffer_load_dword v28, v1, s[0:3], 0 offen offset:148
-; CHECK-NEXT: buffer_load_dword v29, v1, s[0:3], 0 offen offset:144
-; CHECK-NEXT: buffer_load_dword v30, v1, s[0:3], 0 offen offset:140
-; CHECK-NEXT: buffer_load_dword v31, v1, s[0:3], 0 offen offset:136
-; CHECK-NEXT: buffer_load_dword v32, v1, s[0:3], 0 offen offset:132
-; CHECK-NEXT: buffer_load_dword v33, v1, s[0:3], 0 offen offset:128
-; CHECK-NEXT: buffer_load_dword v34, v1, s[0:3], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v35, v1, s[0:3], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v36, v1, s[0:3], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v37, v1, s[0:3], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v38, v1, s[0:3], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v39, v1, s[0:3], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v48, v1, s[0:3], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v49, v1, s[0:3], 0 offen offset:96
-; CHECK-NEXT: buffer_load_dword v50, v1, s[0:3], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v51, v1, s[0:3], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v52, v1, s[0:3], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v53, v1, s[0:3], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v54, v1, s[0:3], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v55, v1, s[0:3], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v64, v1, s[0:3], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v65, v1, s[0:3], 0 offen offset:64
-; CHECK-NEXT: buffer_load_dword v66, v1, s[0:3], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v67, v1, s[0:3], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v68, v1, s[0:3], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v69, v1, s[0:3], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v70, v1, s[0:3], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v71, v1, s[0:3], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v80, v1, s[0:3], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v81, v1, s[0:3], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v82, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v83, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v84, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v85, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v86, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_load_dword v87, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v96, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v97, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2044
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2040
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2036
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2032
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2028
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:2024
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:2020
+; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:2016
+; CHECK-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:2012
+; CHECK-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:2008
+; CHECK-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:2004
+; CHECK-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:2000
+; CHECK-NEXT: buffer_load_dword v14, v1, s[0:3], 0 offen offset:1996
+; CHECK-NEXT: buffer_load_dword v15, v1, s[0:3], 0 offen offset:1992
+; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen offset:1988
+; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:1984
+; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:1980
+; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:1976
+; CHECK-NEXT: buffer_load_dword v20, v1, s[0:3], 0 offen offset:1972
+; CHECK-NEXT: buffer_load_dword v21, v1, s[0:3], 0 offen offset:1968
+; CHECK-NEXT: buffer_load_dword v22, v1, s[0:3], 0 offen offset:1964
+; CHECK-NEXT: buffer_load_dword v23, v1, s[0:3], 0 offen offset:1960
+; CHECK-NEXT: buffer_load_dword v24, v1, s[0:3], 0 offen offset:1956
+; CHECK-NEXT: buffer_load_dword v25, v1, s[0:3], 0 offen offset:1952
+; CHECK-NEXT: buffer_load_dword v26, v1, s[0:3], 0 offen offset:1948
+; CHECK-NEXT: buffer_load_dword v27, v1, s[0:3], 0 offen offset:1944
+; CHECK-NEXT: buffer_load_dword v28, v1, s[0:3], 0 offen offset:1940
+; CHECK-NEXT: buffer_load_dword v29, v1, s[0:3], 0 offen offset:1936
+; CHECK-NEXT: buffer_load_dword v30, v1, s[0:3], 0 offen offset:1932
+; CHECK-NEXT: buffer_load_dword v31, v1, s[0:3], 0 offen offset:1928
+; CHECK-NEXT: buffer_load_dword v32, v1, s[0:3], 0 offen offset:1924
+; CHECK-NEXT: buffer_load_dword v33, v1, s[0:3], 0 offen offset:1920
+; CHECK-NEXT: buffer_load_dword v34, v1, s[0:3], 0 offen offset:1916
+; CHECK-NEXT: buffer_load_dword v35, v1, s[0:3], 0 offen offset:1912
+; CHECK-NEXT: buffer_load_dword v36, v1, s[0:3], 0 offen offset:1908
+; CHECK-NEXT: buffer_load_dword v37, v1, s[0:3], 0 offen offset:1904
+; CHECK-NEXT: buffer_load_dword v38, v1, s[0:3], 0 offen offset:1900
+; CHECK-NEXT: buffer_load_dword v39, v1, s[0:3], 0 offen offset:1896
+; CHECK-NEXT: buffer_load_dword v48, v1, s[0:3], 0 offen offset:1892
+; CHECK-NEXT: buffer_load_dword v49, v1, s[0:3], 0 offen offset:1888
+; CHECK-NEXT: buffer_load_dword v50, v1, s[0:3], 0 offen offset:1884
+; CHECK-NEXT: buffer_load_dword v51, v1, s[0:3], 0 offen offset:1880
+; CHECK-NEXT: buffer_load_dword v52, v1, s[0:3], 0 offen offset:1876
+; CHECK-NEXT: buffer_load_dword v53, v1, s[0:3], 0 offen offset:1872
+; CHECK-NEXT: buffer_load_dword v54, v1, s[0:3], 0 offen offset:1868
+; CHECK-NEXT: buffer_load_dword v55, v1, s[0:3], 0 offen offset:1864
+; CHECK-NEXT: buffer_load_dword v64, v1, s[0:3], 0 offen offset:1860
+; CHECK-NEXT: buffer_load_dword v65, v1, s[0:3], 0 offen offset:1856
+; CHECK-NEXT: buffer_load_dword v66, v1, s[0:3], 0 offen offset:1852
+; CHECK-NEXT: buffer_load_dword v67, v1, s[0:3], 0 offen offset:1848
+; CHECK-NEXT: buffer_load_dword v68, v1, s[0:3], 0 offen offset:1844
+; CHECK-NEXT: buffer_load_dword v69, v1, s[0:3], 0 offen offset:1840
+; CHECK-NEXT: buffer_load_dword v70, v1, s[0:3], 0 offen offset:1836
+; CHECK-NEXT: buffer_load_dword v71, v1, s[0:3], 0 offen offset:1832
+; CHECK-NEXT: buffer_load_dword v80, v1, s[0:3], 0 offen offset:1828
+; CHECK-NEXT: buffer_load_dword v81, v1, s[0:3], 0 offen offset:1824
+; CHECK-NEXT: buffer_load_dword v82, v1, s[0:3], 0 offen offset:1820
+; CHECK-NEXT: buffer_load_dword v83, v1, s[0:3], 0 offen offset:1816
+; CHECK-NEXT: buffer_load_dword v84, v1, s[0:3], 0 offen offset:1812
+; CHECK-NEXT: buffer_load_dword v85, v1, s[0:3], 0 offen offset:1808
+; CHECK-NEXT: buffer_load_dword v86, v1, s[0:3], 0 offen offset:1804
+; CHECK-NEXT: buffer_load_dword v87, v1, s[0:3], 0 offen offset:1800
+; CHECK-NEXT: buffer_load_dword v96, v1, s[0:3], 0 offen offset:1796
+; CHECK-NEXT: buffer_load_dword v97, v1, s[0:3], 0 offen offset:1792
; CHECK-NEXT: v_add_nc_u32_e32 v1, 0xffffff00, v1
; CHECK-NEXT: s_add_u32 s4, s4, 0x100
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: s_waitcnt vmcnt(62)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:248
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2044
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2040
; CHECK-NEXT: s_waitcnt vmcnt(61)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:244
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2036
; CHECK-NEXT: s_waitcnt vmcnt(60)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2032
; CHECK-NEXT: s_waitcnt vmcnt(59)
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:236
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:2028
; CHECK-NEXT: s_waitcnt vmcnt(58)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:232
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:2024
; CHECK-NEXT: s_waitcnt vmcnt(57)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:228
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:2020
; CHECK-NEXT: s_waitcnt vmcnt(56)
-; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:2016
; CHECK-NEXT: s_waitcnt vmcnt(55)
-; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:220
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:2012
; CHECK-NEXT: s_waitcnt vmcnt(54)
-; CHECK-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
+; CHECK-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:2008
; CHECK-NEXT: s_waitcnt vmcnt(53)
-; CHECK-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:212
+; CHECK-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:2004
; CHECK-NEXT: s_waitcnt vmcnt(52)
-; CHECK-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:208
+; CHECK-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:2000
; CHECK-NEXT: s_waitcnt vmcnt(51)
-; CHECK-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204
+; CHECK-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:1996
; CHECK-NEXT: s_waitcnt vmcnt(50)
-; CHECK-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200
+; CHECK-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:1992
; CHECK-NEXT: s_waitcnt vmcnt(49)
-; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:196
+; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:1988
; CHECK-NEXT: s_waitcnt vmcnt(48)
-; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:1984
; CHECK-NEXT: s_waitcnt vmcnt(47)
-; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:188
+; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:1980
; CHECK-NEXT: s_waitcnt vmcnt(46)
-; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184
+; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:1976
; CHECK-NEXT: s_waitcnt vmcnt(45)
-; CHECK-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:180
+; CHECK-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:1972
; CHECK-NEXT: s_waitcnt vmcnt(44)
-; CHECK-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176
+; CHECK-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:1968
; CHECK-NEXT: s_waitcnt vmcnt(43)
-; CHECK-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:172
+; CHECK-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:1964
; CHECK-NEXT: s_waitcnt vmcnt(42)
-; CHECK-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:168
+; CHECK-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:1960
; CHECK-NEXT: s_waitcnt vmcnt(41)
-; CHECK-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:164
+; CHECK-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:1956
; CHECK-NEXT: s_waitcnt vmcnt(40)
-; CHECK-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:1952
; CHECK-NEXT: s_waitcnt vmcnt(39)
-; CHECK-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:156
+; CHECK-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:1948
; CHECK-NEXT: s_waitcnt vmcnt(38)
-; CHECK-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:152
+; CHECK-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:1944
; CHECK-NEXT: s_waitcnt vmcnt(37)
-; CHECK-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:148
+; CHECK-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:1940
; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:144
+; CHECK-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:1936
; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140
+; CHECK-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:1932
; CHECK-NEXT: s_waitcnt vmcnt(34)
-; CHECK-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:136
+; CHECK-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:1928
; CHECK-NEXT: s_waitcnt vmcnt(33)
-; CHECK-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:132
+; CHECK-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:1924
; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:1920
; CHECK-NEXT: s_waitcnt vmcnt(31)
-; CHECK-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:124
+; CHECK-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:1916
; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:120
+; CHECK-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:1912
; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:116
+; CHECK-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:1908
; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:112
+; CHECK-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:1904
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:108
+; CHECK-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:1900
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:104
+; CHECK-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:1896
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:100
+; CHECK-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:1892
; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:96
+; CHECK-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:1888
; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:92
+; CHECK-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:1884
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:88
+; CHECK-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:1880
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:84
+; CHECK-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:1876
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:80
+; CHECK-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:1872
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:76
+; CHECK-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:1868
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:72
+; CHECK-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:1864
; CHECK-NEXT: s_waitcnt vmcnt(17)
-; CHECK-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:68
+; CHECK-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:1860
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:64
+; CHECK-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:1856
; CHECK-NEXT: s_waitcnt vmcnt(15)
-; CHECK-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:60
+; CHECK-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:1852
; CHECK-NEXT: s_waitcnt vmcnt(14)
-; CHECK-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:56
+; CHECK-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:1848
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:52
+; CHECK-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:1844
; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:48
+; CHECK-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:1840
; CHECK-NEXT: s_waitcnt vmcnt(11)
-; CHECK-NEXT: buffer_store_dword v70, v0, s[0:3], 0 offen offset:44
+; CHECK-NEXT: buffer_store_dword v70, v0, s[0:3], 0 offen offset:1836
; CHECK-NEXT: s_waitcnt vmcnt(10)
-; CHECK-NEXT: buffer_store_dword v71, v0, s[0:3], 0 offen offset:40
+; CHECK-NEXT: buffer_store_dword v71, v0, s[0:3], 0 offen offset:1832
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: buffer_store_dword v80, v0, s[0:3], 0 offen offset:36
+; CHECK-NEXT: buffer_store_dword v80, v0, s[0:3], 0 offen offset:1828
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: buffer_store_dword v81, v0, s[0:3], 0 offen offset:32
+; CHECK-NEXT: buffer_store_dword v81, v0, s[0:3], 0 offen offset:1824
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: buffer_store_dword v82, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v82, v0, s[0:3], 0 offen offset:1820
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: buffer_store_dword v83, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v83, v0, s[0:3], 0 offen offset:1816
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v84, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v84, v0, s[0:3], 0 offen offset:1812
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: buffer_store_dword v85, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v85, v0, s[0:3], 0 offen offset:1808
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v86, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v86, v0, s[0:3], 0 offen offset:1804
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_dword v87, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v87, v0, s[0:3], 0 offen offset:1800
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dword v96, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v96, v0, s[0:3], 0 offen offset:1796
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v97, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v97, v0, s[0:3], 0 offen offset:1792
; CHECK-NEXT: v_add_nc_u32_e32 v0, 0xffffff00, v0
; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB8_5
@@ -11251,1055 +11270,1053 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
; ALIGNED-NEXT: s_cbranch_execz .LBB8_6
; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
-; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
; ALIGNED-NEXT: s_mov_b32 s5, -1
; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2047
; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2046
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2045
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2044
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2043
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2042
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2041
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2040
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2039
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2038
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2037
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2036
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2035
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2034
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2033
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2032
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2031
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2030
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2029
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2028
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2027
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2026
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2025
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2024
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2023
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2022
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2021
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2020
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2019
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2018
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2017
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2016
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2015
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2014
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x3e
-; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:158
-; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:157
-; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:156
-; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:155
-; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:151
-; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:150
-; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:149
-; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:148
-; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:147
-; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:146
-; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:145
-; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:144
-; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:143
-; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:142
-; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:141
-; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140
-; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139
-; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2013
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2012
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2011
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2010
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2009
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2008
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2007
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2006
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2005
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2004
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2003
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2002
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2001
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2000
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1999
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1998
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1997
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1996
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1995
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1994
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1993
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1992
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3e
+; ALIGNED-NEXT: buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:1991
+; ALIGNED-NEXT: buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:1990
+; ALIGNED-NEXT: buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:1989
+; ALIGNED-NEXT: buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:1988
+; ALIGNED-NEXT: buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:1987
+; ALIGNED-NEXT: buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:1986
+; ALIGNED-NEXT: buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:1985
+; ALIGNED-NEXT: buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:1984
+; ALIGNED-NEXT: buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:1983
+; ALIGNED-NEXT: buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:1982
+; ALIGNED-NEXT: buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:1981
+; ALIGNED-NEXT: buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:1980
+; ALIGNED-NEXT: buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:1979
+; ALIGNED-NEXT: buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:1978
+; ALIGNED-NEXT: buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:1977
+; ALIGNED-NEXT: buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:1976
+; ALIGNED-NEXT: buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:1975
+; ALIGNED-NEXT: buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:1974
+; ALIGNED-NEXT: buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:1973
+; ALIGNED-NEXT: buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:1972
+; ALIGNED-NEXT: buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:1971
+; ALIGNED-NEXT: buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:1970
+; ALIGNED-NEXT: buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:1969
+; ALIGNED-NEXT: buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:1968
+; ALIGNED-NEXT: buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:1967
+; ALIGNED-NEXT: buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:1966
+; ALIGNED-NEXT: buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:1965
+; ALIGNED-NEXT: buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:1964
+; ALIGNED-NEXT: buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:1963
+; ALIGNED-NEXT: buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:1962
+; ALIGNED-NEXT: buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:1961
+; ALIGNED-NEXT: buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:1960
+; ALIGNED-NEXT: buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:1959
+; ALIGNED-NEXT: buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:1958
+; ALIGNED-NEXT: buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:1957
+; ALIGNED-NEXT: buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:1956
+; ALIGNED-NEXT: buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:1955
+; ALIGNED-NEXT: buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:1954
+; ALIGNED-NEXT: buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:1953
+; ALIGNED-NEXT: buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:1952
+; ALIGNED-NEXT: buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:1951
+; ALIGNED-NEXT: buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:1950
+; ALIGNED-NEXT: buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:1949
+; ALIGNED-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:1948
+; ALIGNED-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:1947
+; ALIGNED-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:1946
+; ALIGNED-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:1945
+; ALIGNED-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:1944
+; ALIGNED-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:1943
+; ALIGNED-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:1942
+; ALIGNED-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:1941
+; ALIGNED-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:1940
+; ALIGNED-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:1939
+; ALIGNED-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:1938
+; ALIGNED-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:1937
+; ALIGNED-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:1936
+; ALIGNED-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:1935
+; ALIGNED-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:1934
+; ALIGNED-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:1933
+; ALIGNED-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1932
+; ALIGNED-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1931
+; ALIGNED-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:1930
+; ALIGNED-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:1929
; ALIGNED-NEXT: s_clause 0xa
-; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136
-; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135
-; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134
-; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:133
-; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:132
-; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:131
-; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:130
-; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129
-; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128
-; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:1928
+; ALIGNED-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:1927
+; ALIGNED-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:1926
+; ALIGNED-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:1925
+; ALIGNED-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:1924
+; ALIGNED-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:1923
+; ALIGNED-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:1922
+; ALIGNED-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:1921
+; ALIGNED-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:1920
+; ALIGNED-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1919
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1918
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x34
-; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125
-; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124
-; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123
-; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122
-; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:121
-; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:120
-; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:119
-; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:118
-; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:117
-; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:116
-; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:115
-; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:114
-; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:113
-; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:112
-; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:111
-; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:110
-; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:109
-; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:108
-; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:107
-; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:106
-; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:105
-; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:104
-; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:103
-; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:102
-; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:101
-; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:100
-; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:99
-; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:98
-; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:97
-; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:96
-; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:95
-; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:94
-; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:93
-; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:92
-; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:91
-; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:90
-; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:89
-; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:88
-; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:86
-; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:85
-; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:84
-; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:83
-; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:81
-; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:80
-; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:78
-; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:76
-; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:75
-; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:74
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT: buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:1917
+; ALIGNED-NEXT: buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:1916
+; ALIGNED-NEXT: buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:1915
+; ALIGNED-NEXT: buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:1914
+; ALIGNED-NEXT: buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:1913
+; ALIGNED-NEXT: buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:1912
+; ALIGNED-NEXT: buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:1911
+; ALIGNED-NEXT: buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:1910
+; ALIGNED-NEXT: buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:1909
+; ALIGNED-NEXT: buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:1908
+; ALIGNED-NEXT: buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:1907
+; ALIGNED-NEXT: buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:1906
+; ALIGNED-NEXT: buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:1905
+; ALIGNED-NEXT: buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:1904
+; ALIGNED-NEXT: buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:1903
+; ALIGNED-NEXT: buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:1902
+; ALIGNED-NEXT: buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:1901
+; ALIGNED-NEXT: buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:1900
+; ALIGNED-NEXT: buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:1899
+; ALIGNED-NEXT: buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:1898
+; ALIGNED-NEXT: buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:1897
+; ALIGNED-NEXT: buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:1896
+; ALIGNED-NEXT: buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:1895
+; ALIGNED-NEXT: buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:1894
+; ALIGNED-NEXT: buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:1893
+; ALIGNED-NEXT: buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:1892
+; ALIGNED-NEXT: buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:1891
+; ALIGNED-NEXT: buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:1890
+; ALIGNED-NEXT: buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:1889
+; ALIGNED-NEXT: buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:1888
+; ALIGNED-NEXT: buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:1887
+; ALIGNED-NEXT: buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:1886
+; ALIGNED-NEXT: buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:1885
+; ALIGNED-NEXT: buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:1884
+; ALIGNED-NEXT: buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:1883
+; ALIGNED-NEXT: buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:1882
+; ALIGNED-NEXT: buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:1881
+; ALIGNED-NEXT: buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:1880
+; ALIGNED-NEXT: buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:1879
+; ALIGNED-NEXT: buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:1878
+; ALIGNED-NEXT: buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:1877
+; ALIGNED-NEXT: buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:1876
+; ALIGNED-NEXT: buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:1875
+; ALIGNED-NEXT: buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:1874
+; ALIGNED-NEXT: buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:1873
+; ALIGNED-NEXT: buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:1872
+; ALIGNED-NEXT: buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:1871
+; ALIGNED-NEXT: buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:1870
+; ALIGNED-NEXT: buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:1869
+; ALIGNED-NEXT: buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:1868
+; ALIGNED-NEXT: buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:1867
+; ALIGNED-NEXT: buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:1866
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1865
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1864
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1863
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1862
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1861
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1860
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1859
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1858
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1857
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1856
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1855
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1854
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1853
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1852
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1851
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1850
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1849
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1848
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1847
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1846
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1845
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1844
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1843
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1842
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1841
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1840
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1839
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1838
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1837
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1836
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1835
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1834
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1833
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1832
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1831
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1830
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1829
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1828
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1827
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1826
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1825
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1824
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1823
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1822
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1821
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1820
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1819
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1818
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1817
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1816
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1815
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1814
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1813
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1812
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1811
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1810
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1809
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1808
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1807
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1806
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1805
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1804
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1803
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1802
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1801
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1800
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1799
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1798
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1797
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1796
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1795
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1794
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1793
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
+; ALIGNED-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1792
; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0xffffff00, v1
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2047
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2046
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2045
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2044
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2043
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2042
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2041
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2040
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2039
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2038
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2037
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2036
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2035
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2034
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2033
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2032
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2031
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2030
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2029
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2028
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2027
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2026
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2025
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2024
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2023
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2022
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2021
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2020
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2019
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2018
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2017
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2016
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2015
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2014
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2013
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:158
-; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:157
-; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:156
-; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:155
-; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:151
-; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:150
-; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:149
-; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:148
-; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:147
-; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:146
-; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:145
-; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:144
-; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:143
-; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:142
-; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:141
-; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:140
-; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:139
-; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:137
-; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:136
-; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:135
-; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:134
-; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:133
-; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:132
-; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:131
-; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:130
-; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:129
-; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:128
-; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2012
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2011
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2010
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2009
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2008
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2007
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2006
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2005
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2004
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2003
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2002
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2001
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2000
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1999
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1998
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1997
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1996
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1995
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1994
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1993
+; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1992
+; ALIGNED-NEXT: buffer_store_byte v115, v0, s[0:3], 0 offen offset:1991
+; ALIGNED-NEXT: buffer_store_byte v114, v0, s[0:3], 0 offen offset:1990
+; ALIGNED-NEXT: buffer_store_byte v113, v0, s[0:3], 0 offen offset:1989
+; ALIGNED-NEXT: buffer_store_byte v112, v0, s[0:3], 0 offen offset:1988
+; ALIGNED-NEXT: buffer_store_byte v103, v0, s[0:3], 0 offen offset:1987
+; ALIGNED-NEXT: buffer_store_byte v102, v0, s[0:3], 0 offen offset:1986
+; ALIGNED-NEXT: buffer_store_byte v101, v0, s[0:3], 0 offen offset:1985
+; ALIGNED-NEXT: buffer_store_byte v100, v0, s[0:3], 0 offen offset:1984
+; ALIGNED-NEXT: buffer_store_byte v99, v0, s[0:3], 0 offen offset:1983
+; ALIGNED-NEXT: buffer_store_byte v98, v0, s[0:3], 0 offen offset:1982
+; ALIGNED-NEXT: buffer_store_byte v97, v0, s[0:3], 0 offen offset:1981
+; ALIGNED-NEXT: buffer_store_byte v96, v0, s[0:3], 0 offen offset:1980
+; ALIGNED-NEXT: buffer_store_byte v87, v0, s[0:3], 0 offen offset:1979
+; ALIGNED-NEXT: buffer_store_byte v86, v0, s[0:3], 0 offen offset:1978
+; ALIGNED-NEXT: buffer_store_byte v85, v0, s[0:3], 0 offen offset:1977
+; ALIGNED-NEXT: buffer_store_byte v84, v0, s[0:3], 0 offen offset:1976
+; ALIGNED-NEXT: buffer_store_byte v83, v0, s[0:3], 0 offen offset:1975
+; ALIGNED-NEXT: buffer_store_byte v82, v0, s[0:3], 0 offen offset:1974
+; ALIGNED-NEXT: buffer_store_byte v81, v0, s[0:3], 0 offen offset:1973
+; ALIGNED-NEXT: buffer_store_byte v80, v0, s[0:3], 0 offen offset:1972
+; ALIGNED-NEXT: buffer_store_byte v71, v0, s[0:3], 0 offen offset:1971
+; ALIGNED-NEXT: buffer_store_byte v70, v0, s[0:3], 0 offen offset:1970
+; ALIGNED-NEXT: buffer_store_byte v69, v0, s[0:3], 0 offen offset:1969
+; ALIGNED-NEXT: buffer_store_byte v68, v0, s[0:3], 0 offen offset:1968
+; ALIGNED-NEXT: buffer_store_byte v67, v0, s[0:3], 0 offen offset:1967
+; ALIGNED-NEXT: buffer_store_byte v66, v0, s[0:3], 0 offen offset:1966
+; ALIGNED-NEXT: buffer_store_byte v65, v0, s[0:3], 0 offen offset:1965
+; ALIGNED-NEXT: buffer_store_byte v64, v0, s[0:3], 0 offen offset:1964
+; ALIGNED-NEXT: buffer_store_byte v55, v0, s[0:3], 0 offen offset:1963
+; ALIGNED-NEXT: buffer_store_byte v54, v0, s[0:3], 0 offen offset:1962
+; ALIGNED-NEXT: buffer_store_byte v53, v0, s[0:3], 0 offen offset:1961
+; ALIGNED-NEXT: buffer_store_byte v52, v0, s[0:3], 0 offen offset:1960
+; ALIGNED-NEXT: buffer_store_byte v51, v0, s[0:3], 0 offen offset:1959
+; ALIGNED-NEXT: buffer_store_byte v50, v0, s[0:3], 0 offen offset:1958
+; ALIGNED-NEXT: buffer_store_byte v49, v0, s[0:3], 0 offen offset:1957
+; ALIGNED-NEXT: buffer_store_byte v48, v0, s[0:3], 0 offen offset:1956
+; ALIGNED-NEXT: buffer_store_byte v39, v0, s[0:3], 0 offen offset:1955
+; ALIGNED-NEXT: buffer_store_byte v38, v0, s[0:3], 0 offen offset:1954
+; ALIGNED-NEXT: buffer_store_byte v37, v0, s[0:3], 0 offen offset:1953
+; ALIGNED-NEXT: buffer_store_byte v36, v0, s[0:3], 0 offen offset:1952
+; ALIGNED-NEXT: buffer_store_byte v35, v0, s[0:3], 0 offen offset:1951
+; ALIGNED-NEXT: buffer_store_byte v34, v0, s[0:3], 0 offen offset:1950
+; ALIGNED-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1949
+; ALIGNED-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1948
+; ALIGNED-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:1947
+; ALIGNED-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:1946
+; ALIGNED-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:1945
+; ALIGNED-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:1944
+; ALIGNED-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:1943
+; ALIGNED-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:1942
+; ALIGNED-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:1941
+; ALIGNED-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:1940
+; ALIGNED-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:1939
+; ALIGNED-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:1938
+; ALIGNED-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:1937
+; ALIGNED-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:1936
+; ALIGNED-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:1935
+; ALIGNED-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:1934
+; ALIGNED-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1933
+; ALIGNED-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1932
+; ALIGNED-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:1931
+; ALIGNED-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:1930
+; ALIGNED-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:1929
+; ALIGNED-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:1928
+; ALIGNED-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:1927
+; ALIGNED-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:1926
+; ALIGNED-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:1925
+; ALIGNED-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:1924
+; ALIGNED-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:1923
+; ALIGNED-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:1922
+; ALIGNED-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:1921
+; ALIGNED-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1920
+; ALIGNED-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:1919
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:126
-; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:125
-; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:124
-; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:123
-; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:122
-; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:121
-; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:120
-; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:119
-; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:118
-; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:117
-; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:116
-; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:115
-; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:114
-; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:113
-; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:112
-; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:111
-; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:110
-; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:109
-; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:108
-; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:107
-; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:106
-; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:105
-; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:104
-; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:103
-; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:102
-; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:101
-; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:100
-; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:99
-; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:98
-; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:97
-; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:96
-; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:95
-; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:94
-; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:93
-; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:92
-; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:91
-; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:90
-; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:89
-; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:88
-; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:86
-; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:85
-; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:84
-; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:83
-; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:81
-; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:80
-; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:78
-; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:76
-; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:75
-; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1918
+; ALIGNED-NEXT: buffer_store_byte v127, v0, s[0:3], 0 offen offset:1917
+; ALIGNED-NEXT: buffer_store_byte v126, v0, s[0:3], 0 offen offset:1916
+; ALIGNED-NEXT: buffer_store_byte v125, v0, s[0:3], 0 offen offset:1915
+; ALIGNED-NEXT: buffer_store_byte v124, v0, s[0:3], 0 offen offset:1914
+; ALIGNED-NEXT: buffer_store_byte v123, v0, s[0:3], 0 offen offset:1913
+; ALIGNED-NEXT: buffer_store_byte v122, v0, s[0:3], 0 offen offset:1912
+; ALIGNED-NEXT: buffer_store_byte v121, v0, s[0:3], 0 offen offset:1911
+; ALIGNED-NEXT: buffer_store_byte v120, v0, s[0:3], 0 offen offset:1910
+; ALIGNED-NEXT: buffer_store_byte v111, v0, s[0:3], 0 offen offset:1909
+; ALIGNED-NEXT: buffer_store_byte v110, v0, s[0:3], 0 offen offset:1908
+; ALIGNED-NEXT: buffer_store_byte v109, v0, s[0:3], 0 offen offset:1907
+; ALIGNED-NEXT: buffer_store_byte v108, v0, s[0:3], 0 offen offset:1906
+; ALIGNED-NEXT: buffer_store_byte v107, v0, s[0:3], 0 offen offset:1905
+; ALIGNED-NEXT: buffer_store_byte v106, v0, s[0:3], 0 offen offset:1904
+; ALIGNED-NEXT: buffer_store_byte v105, v0, s[0:3], 0 offen offset:1903
+; ALIGNED-NEXT: buffer_store_byte v104, v0, s[0:3], 0 offen offset:1902
+; ALIGNED-NEXT: buffer_store_byte v95, v0, s[0:3], 0 offen offset:1901
+; ALIGNED-NEXT: buffer_store_byte v94, v0, s[0:3], 0 offen offset:1900
+; ALIGNED-NEXT: buffer_store_byte v93, v0, s[0:3], 0 offen offset:1899
+; ALIGNED-NEXT: buffer_store_byte v92, v0, s[0:3], 0 offen offset:1898
+; ALIGNED-NEXT: buffer_store_byte v91, v0, s[0:3], 0 offen offset:1897
+; ALIGNED-NEXT: buffer_store_byte v90, v0, s[0:3], 0 offen offset:1896
+; ALIGNED-NEXT: buffer_store_byte v89, v0, s[0:3], 0 offen offset:1895
+; ALIGNED-NEXT: buffer_store_byte v88, v0, s[0:3], 0 offen offset:1894
+; ALIGNED-NEXT: buffer_store_byte v79, v0, s[0:3], 0 offen offset:1893
+; ALIGNED-NEXT: buffer_store_byte v78, v0, s[0:3], 0 offen offset:1892
+; ALIGNED-NEXT: buffer_store_byte v77, v0, s[0:3], 0 offen offset:1891
+; ALIGNED-NEXT: buffer_store_byte v76, v0, s[0:3], 0 offen offset:1890
+; ALIGNED-NEXT: buffer_store_byte v75, v0, s[0:3], 0 offen offset:1889
+; ALIGNED-NEXT: buffer_store_byte v74, v0, s[0:3], 0 offen offset:1888
+; ALIGNED-NEXT: buffer_store_byte v73, v0, s[0:3], 0 offen offset:1887
+; ALIGNED-NEXT: buffer_store_byte v72, v0, s[0:3], 0 offen offset:1886
+; ALIGNED-NEXT: buffer_store_byte v63, v0, s[0:3], 0 offen offset:1885
+; ALIGNED-NEXT: buffer_store_byte v62, v0, s[0:3], 0 offen offset:1884
+; ALIGNED-NEXT: buffer_store_byte v61, v0, s[0:3], 0 offen offset:1883
+; ALIGNED-NEXT: buffer_store_byte v60, v0, s[0:3], 0 offen offset:1882
+; ALIGNED-NEXT: buffer_store_byte v59, v0, s[0:3], 0 offen offset:1881
+; ALIGNED-NEXT: buffer_store_byte v58, v0, s[0:3], 0 offen offset:1880
+; ALIGNED-NEXT: buffer_store_byte v57, v0, s[0:3], 0 offen offset:1879
+; ALIGNED-NEXT: buffer_store_byte v56, v0, s[0:3], 0 offen offset:1878
+; ALIGNED-NEXT: buffer_store_byte v47, v0, s[0:3], 0 offen offset:1877
+; ALIGNED-NEXT: buffer_store_byte v46, v0, s[0:3], 0 offen offset:1876
+; ALIGNED-NEXT: buffer_store_byte v45, v0, s[0:3], 0 offen offset:1875
+; ALIGNED-NEXT: buffer_store_byte v44, v0, s[0:3], 0 offen offset:1874
+; ALIGNED-NEXT: buffer_store_byte v43, v0, s[0:3], 0 offen offset:1873
+; ALIGNED-NEXT: buffer_store_byte v42, v0, s[0:3], 0 offen offset:1872
+; ALIGNED-NEXT: buffer_store_byte v41, v0, s[0:3], 0 offen offset:1871
+; ALIGNED-NEXT: buffer_store_byte v40, v0, s[0:3], 0 offen offset:1870
+; ALIGNED-NEXT: buffer_store_byte v119, v0, s[0:3], 0 offen offset:1869
+; ALIGNED-NEXT: buffer_store_byte v118, v0, s[0:3], 0 offen offset:1868
+; ALIGNED-NEXT: buffer_store_byte v117, v0, s[0:3], 0 offen offset:1867
+; ALIGNED-NEXT: buffer_store_byte v116, v0, s[0:3], 0 offen offset:1866
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1865
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1864
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1863
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1862
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1861
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1860
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1859
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1858
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1857
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1856
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1855
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1854
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1853
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1852
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1851
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1850
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1849
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1848
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1847
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1846
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1845
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1844
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1843
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1842
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1841
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1840
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1839
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1838
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1837
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1836
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1835
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1834
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1833
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1832
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1831
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1830
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1829
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1828
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1827
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1826
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1825
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1824
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1823
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1822
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1821
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1820
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1819
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1818
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1817
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1816
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1815
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1814
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1813
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1812
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1811
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1810
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1809
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1808
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1807
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1806
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1805
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1804
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1803
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1802
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1801
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1800
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1799
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1798
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1797
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1796
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1795
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1794
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1793
; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen
+; ALIGNED-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:1792
; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0xffffff00, v0
; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_5
; ALIGNED-NEXT: .LBB8_6: ; %Flow19
@@ -12461,63 +12478,61 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2032
; UNROLL3-NEXT: s_clause 0x3
-; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2028
-; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024
-; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020
-; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016
-; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1
-; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
+; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2016
; UNROLL3-NEXT: s_waitcnt vmcnt(3)
-; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2028
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
-; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2024
; UNROLL3-NEXT: s_waitcnt vmcnt(1)
-; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2020
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
-; UNROLL3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2016
; UNROLL3-NEXT: .LBB8_6: ; %memmove_bwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: s_clause 0xb
-; UNROLL3-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen offset:44
-; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:40
-; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:36
-; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:32
-; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:28
-; UNROLL3-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:24
-; UNROLL3-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:20
-; UNROLL3-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:16
-; UNROLL3-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:12
-; UNROLL3-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:8
-; UNROLL3-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:4
-; UNROLL3-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen
+; UNROLL3-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:2012
+; UNROLL3-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:2008
+; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2004
+; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2000
+; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:1996
+; UNROLL3-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:1992
+; UNROLL3-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:1988
+; UNROLL3-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:1984
+; UNROLL3-NEXT: buffer_load_dword v10, v1, s[0:3], 0 offen offset:1980
+; UNROLL3-NEXT: buffer_load_dword v11, v1, s[0:3], 0 offen offset:1976
+; UNROLL3-NEXT: buffer_load_dword v12, v1, s[0:3], 0 offen offset:1972
+; UNROLL3-NEXT: buffer_load_dword v13, v1, s[0:3], 0 offen offset:1968
; UNROLL3-NEXT: v_subrev_nc_u32_e32 v1, 48, v1
; UNROLL3-NEXT: s_add_u32 s4, s4, 48
; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
; UNROLL3-NEXT: s_waitcnt vmcnt(11)
-; UNROLL3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:2012
; UNROLL3-NEXT: s_waitcnt vmcnt(10)
-; UNROLL3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2008
; UNROLL3-NEXT: s_waitcnt vmcnt(9)
-; UNROLL3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:2004
; UNROLL3-NEXT: s_waitcnt vmcnt(8)
-; UNROLL3-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:2000
; UNROLL3-NEXT: s_waitcnt vmcnt(7)
-; UNROLL3-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:1996
; UNROLL3-NEXT: s_waitcnt vmcnt(6)
-; UNROLL3-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:1992
; UNROLL3-NEXT: s_waitcnt vmcnt(5)
-; UNROLL3-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:1988
; UNROLL3-NEXT: s_waitcnt vmcnt(4)
-; UNROLL3-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:1984
; UNROLL3-NEXT: s_waitcnt vmcnt(3)
-; UNROLL3-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:1980
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
-; UNROLL3-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:1976
; UNROLL3-NEXT: s_waitcnt vmcnt(1)
-; UNROLL3-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:1972
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
-; UNROLL3-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
-; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2
+; UNROLL3-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:1968
+; UNROLL3-NEXT: v_subrev_nc_u32_e32 v0, 48, v0
; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0
; UNROLL3-NEXT: s_cbranch_scc0 .LBB8_6
; UNROLL3-NEXT: .LBB8_7: ; %Flow17
@@ -12533,233 +12548,233 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
-; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo
; CHECK-NEXT: v_cmpx_ge_u32_e64 v2, v3
-; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
-; CHECK-NEXT: s_cbranch_execz .LBB9_2
-; CHECK-NEXT: .LBB9_1: ; %memmove_fwd_loop
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_clause 0x3e
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
-; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
-; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:172
-; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:188
-; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:184
-; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:180
-; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:176
-; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:168
-; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:164
-; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:160
-; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
-; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
-; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
-; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
-; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208
-; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
-; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
-; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
-; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:236
-; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:252
-; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:248
-; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:244
-; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:240
-; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:232
-; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:228
-; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:224
-; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:140
-; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:156
-; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:152
-; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:148
-; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:144
-; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:136
-; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:132
-; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:128
-; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
-; CHECK-NEXT: s_add_u32 s4, s4, 0x100
-; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48
-; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
-; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
-; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:192
-; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:176
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:128
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(11)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10]
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87]
-; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800
-; CHECK-NEXT: s_cbranch_scc1 .LBB9_1
-; CHECK-NEXT: .LBB9_2: ; %Flow10
-; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6
-; CHECK-NEXT: s_cbranch_execz .LBB9_5
-; CHECK-NEXT: ; %bb.3: ; %memmove_bwd_loop.preheader
-; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x700, v2
-; CHECK-NEXT: s_movk_i32 s6, 0xff00
-; CHECK-NEXT: s_mov_b64 s[4:5], 0x700
-; CHECK-NEXT: s_mov_b32 s7, -1
-; CHECK-NEXT: .LBB9_4: ; %memmove_bwd_loop
+; CHECK-NEXT: s_xor_b32 s6, exec_lo, s4
+; CHECK-NEXT: s_cbranch_execz .LBB9_3
+; CHECK-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
+; CHECK-NEXT: s_mov_b64 s[4:5], 0x800
+; CHECK-NEXT: .LBB9_2: ; %memmove_fwd_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_clause 0x3e
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64
-; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
-; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236
-; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252
-; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248
-; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244
-; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:240
-; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232
-; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228
-; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224
-; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
-; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
-; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
-; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
-; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:208
-; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
-; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
-; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
-; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172
-; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188
-; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184
-; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180
-; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176
-; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168
-; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164
-; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:172
+; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:188
+; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:184
+; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:180
+; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:176
+; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:168
+; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:164
+; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:204
+; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:220
+; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:216
+; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:212
+; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:208
+; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:200
+; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:196
+; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:236
+; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:252
+; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:248
+; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:244
+; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:240
+; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:232
+; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:228
+; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:224
+; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:140
; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156
; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152
; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148
; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:144
-; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:140
-; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:136
-; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:132
-; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:128
-; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4
-; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo
-; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2
-; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48
+; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:136
+; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:132
+; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, 48
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, 0, v1, vcc_lo
+; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00
-; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, -1
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:192
-; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:176
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144
-; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:128
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:192
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:176
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:144
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[64:67] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[15:18] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[11:14] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x100, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:64
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99]
+; CHECK-NEXT: s_cbranch_scc1 .LBB9_2
+; CHECK-NEXT: .LBB9_3: ; %Flow16
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
+; CHECK-NEXT: s_cbranch_execz .LBB9_6
+; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0x700, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; CHECK-NEXT: s_movk_i32 s4, 0xf800
+; CHECK-NEXT: s_mov_b32 s5, -1
+; CHECK-NEXT: .LBB9_5: ; %memmove_bwd_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_clause 0x3e
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:1792
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:1796
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:1800
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:1804
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:1808
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:1812
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:1816
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:1820
+; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:1824
+; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:1828
+; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:1832
+; CHECK-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:1836
+; CHECK-NEXT: buffer_load_dword v18, v2, s[0:3], 0 offen offset:1868
+; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:1884
+; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:1880
+; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:1876
+; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:1872
+; CHECK-NEXT: buffer_load_dword v17, v2, s[0:3], 0 offen offset:1864
+; CHECK-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:1860
+; CHECK-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:1856
+; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:2028
+; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:2044
+; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:2040
+; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:2036
+; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:2032
+; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:2024
+; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:2020
+; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:2016
+; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:1996
+; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:2012
+; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:2008
+; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:2004
+; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:2000
+; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:1992
+; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:1988
+; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:1984
+; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:1964
+; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:1980
+; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:1976
+; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:1972
+; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:1968
+; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:1960
+; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:1956
+; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:1952
+; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:1948
+; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:1944
+; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:1940
+; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:1936
+; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:1932
+; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:1928
+; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:1924
+; CHECK-NEXT: buffer_load_dword v68, v2, s[0:3], 0 offen offset:1920
+; CHECK-NEXT: buffer_load_dword v83, v2, s[0:3], 0 offen offset:1916
+; CHECK-NEXT: buffer_load_dword v82, v2, s[0:3], 0 offen offset:1912
+; CHECK-NEXT: buffer_load_dword v81, v2, s[0:3], 0 offen offset:1908
+; CHECK-NEXT: buffer_load_dword v80, v2, s[0:3], 0 offen offset:1904
+; CHECK-NEXT: buffer_load_dword v87, v2, s[0:3], 0 offen offset:1900
+; CHECK-NEXT: buffer_load_dword v86, v2, s[0:3], 0 offen offset:1896
+; CHECK-NEXT: buffer_load_dword v85, v2, s[0:3], 0 offen offset:1892
+; CHECK-NEXT: buffer_load_dword v84, v2, s[0:3], 0 offen offset:1888
+; CHECK-NEXT: buffer_load_dword v96, v2, s[0:3], 0 offen offset:1840
+; CHECK-NEXT: buffer_load_dword v97, v2, s[0:3], 0 offen offset:1844
+; CHECK-NEXT: buffer_load_dword v98, v2, s[0:3], 0 offen offset:1848
+; CHECK-NEXT: buffer_load_dword v99, v2, s[0:3], 0 offen offset:1852
+; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, 48
+; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, 0, v1, vcc_lo
+; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2
+; CHECK-NEXT: s_add_u32 s4, s4, 0x100
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: s_waitcnt vmcnt(39)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:192
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:176
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:160
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:144
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:128
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:112
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:96
; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[68:71] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[15:18] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[11:14] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffff00, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 0
; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10]
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:64
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87]
-; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
-; CHECK-NEXT: s_cbranch_scc0 .LBB9_4
-; CHECK-NEXT: .LBB9_5: ; %Flow11
-; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99]
+; CHECK-NEXT: s_cbranch_scc0 .LBB9_5
+; CHECK-NEXT: .LBB9_6: ; %Flow17
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
;
; ALIGNED-LABEL: memmove_p0_p5_sz2048:
; ALIGNED: ; %bb.0: ; %entry
; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
-; ALIGNED-NEXT: s_mov_b32 s6, exec_lo
+; ALIGNED-NEXT: v_mov_b32_e32 v69, v1
+; ALIGNED-NEXT: v_mov_b32_e32 v68, v0
; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
@@ -12808,2370 +12823,2380 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
-; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0
-; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6
-; ALIGNED-NEXT: s_cbranch_execz .LBB9_2
-; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop
+; ALIGNED-NEXT: s_mov_b32 s4, exec_lo
+; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[68:69]
+; ALIGNED-NEXT: v_cndmask_b32_e32 v3, -1, v68, vcc_lo
+; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v3
+; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4
+; ALIGNED-NEXT: s_cbranch_execz .LBB9_3
+; ALIGNED-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x800
+; ALIGNED-NEXT: .LBB9_2: ; %memmove_fwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: s_clause 0x3e
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:24
; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
-; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
-; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39
-; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42
-; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:43
-; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44
-; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:47
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:48
-; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:49
-; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50
-; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:51
-; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:52
-; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:53
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:54
; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57
-; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:61
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64
-; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:70
-; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
-; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
-; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
-; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:155
-; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: s_clause 0x31
-; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:76
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:78
+; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:79
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: s_clause 0x2f
+; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
+; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
+; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0
; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8
-; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
-; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17
-; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v8, 8, v6
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v9
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v7
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v8, v12, 8, v13
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v6, 16, v5
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v15
; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
-; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22
-; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25
-; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
-; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v33
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v34, 8, v31
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v37
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v48
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v38
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v8, 16, v7
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v11, v17, 8, v14
+; ALIGNED-NEXT: v_lshl_or_b32 v12, v21, 8, v19
+; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v10, 16, v9
+; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v23
+; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26
+; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v12, 16, v11
+; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v15, v25, 8, v22
+; ALIGNED-NEXT: v_lshl_or_b32 v16, v29, 8, v27
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v14, 16, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v17, v31, 8, v30
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v35
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v37, 8, v32
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v36, 8, v34
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v16, 16, v15
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v49, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v50, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v8, v51, 8, v48
+; ALIGNED-NEXT: v_lshl_or_b32 v9, v52, 8, v53
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v17
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v54
; ALIGNED-NEXT: s_waitcnt vmcnt(61)
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 16, v4
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v66
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v67, 8, v65
+; ALIGNED-NEXT: s_waitcnt vmcnt(59)
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v7, 16, v6
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v71, 8, v70
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: s_waitcnt vmcnt(59)
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v9, 16, v8
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
+; ALIGNED-NEXT: s_waitcnt vmcnt(58)
+; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v80
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v83, 8, v82
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v6, 16, v5
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(61)
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x3
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:202
; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:211
; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0xc
-; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(56)
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v44, 8, v46
+; ALIGNED-NEXT: s_waitcnt vmcnt(52)
+; ALIGNED-NEXT: v_lshl_or_b32 v93, v117, 8, v119
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: s_waitcnt vmcnt(48)
+; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(47)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(46)
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(45)
+; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(44)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(43)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(41)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(40)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(39)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(33)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(32)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3
-; ALIGNED-NEXT: s_clause 0x8
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
-; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:223
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(35)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(34)
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(28)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v84, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(28)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:81
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
-; ALIGNED-NEXT: s_waitcnt vmcnt(10)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(7)
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(5)
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(2)
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v4, 8, v7
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:98
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v1
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:103
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:95
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:94
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:91
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:90
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:88
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:100
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:97
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:99
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:100
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:97
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v4, 8, v7
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:114
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v1
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:119
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:111
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:110
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:107
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:108
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:106
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:105
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:106
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:104
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:116
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:113
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:115
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:116
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:113
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:134
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:135
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:127
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v4, 8, v7
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v1
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:125
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:135
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:123
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:124
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:122
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:134
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:121
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:122
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:120
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:132
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:129
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:131
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:132
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:129
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v4, 8, v7
; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:146
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v1
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:143
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:142
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:139
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:138
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:148
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:145
; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v4, 8, v7
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v98, 8, v100
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v13
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v15, 8, v20
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v1
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
+; ALIGNED-NEXT: s_clause 0x1
; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v9, 8, v7
+; ALIGNED-NEXT: s_waitcnt vmcnt(12)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(11)
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v4
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v122, 8, v125
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v108
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v101
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v120, 8, v110
-; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v96
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v86
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v95, 8, v120
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v73, 8, v111
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 8, v105
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v63, 8, v79
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v43, 8, v58
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v40, 8, v45
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v123
-; ALIGNED-NEXT: v_lshl_or_b32 v76, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v70, 8, v84
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v95
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v82
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v88, 8, v116
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v118, 8, v103
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v109
-; ALIGNED-NEXT: v_lshl_or_b32 v57, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v83
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v65
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v104
-; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v81
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v101
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v81, 8, v71
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v107
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v66
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v79, 8, v88
-; ALIGNED-NEXT: v_lshl_or_b32 v118, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v53
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v71, 8, v51
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v99
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v67, 8, v83
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v90
-; ALIGNED-NEXT: v_lshl_or_b32 v102, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v74
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v35
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v65
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v64
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v77
-; ALIGNED-NEXT: v_lshl_or_b32 v85, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v48
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v34, 8, v38
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v59
-; ALIGNED-NEXT: v_lshl_or_b32 v80, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v31, 8, v33
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v53
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v50
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v62
-; ALIGNED-NEXT: v_lshl_or_b32 v54, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v48
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v34, 8, v33
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v72
-; ALIGNED-NEXT: v_lshl_or_b32 v52, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v21, 8, v23
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v45, 8, v46
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v56
-; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v22
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v40
-; ALIGNED-NEXT: v_lshl_or_b32 v24, v5, 16, v4
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v14
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v42, 8, v43
-; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v16, v6, 16, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v8, 8, v10
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v113
-; ALIGNED-NEXT: buffer_store_dword v124, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v6, v121, 16, v6
-; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:1
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v116, 8, v119
-; ALIGNED-NEXT: v_lshl_or_b32 v108, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v103, 8, v114
-; ALIGNED-NEXT: v_lshl_or_b32 v92, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v121, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: v_lshl_or_b32 v90, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v38
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v36
+; ALIGNED-NEXT: v_lshl_or_b32 v61, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v27, 8, v28
+; ALIGNED-NEXT: v_lshl_or_b32 v47, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v24, 8, v26
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v23
+; ALIGNED-NEXT: v_lshl_or_b32 v114, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19
+; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v20
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v9
+; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v16
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v12
+; ALIGNED-NEXT: v_lshl_or_b32 v39, v4, 16, v3
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v0, 8, v1
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:15
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v5, 8, v124
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v4, 8, v125
-; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v0, 8, v7
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v13, 8, v1
+; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v123, 8, v126
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v121, 8, v124
+; ALIGNED-NEXT: v_lshl_or_b32 v22, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v110, 8, v109
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v107, 8, v106
+; ALIGNED-NEXT: v_lshl_or_b32 v11, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v104, 8, v94
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v92, 8, v91
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 8, v78
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v89, 8, v74
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v62, 8, v72
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v60, 8, v59
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v57, 8, v56
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v75, 16, v4
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v42, 8, v41
+; ALIGNED-NEXT: v_lshl_or_b32 v113, v93, 16, v75
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v102, 8, v112
+; ALIGNED-NEXT: v_lshl_or_b32 v93, v115, 8, v100
+; ALIGNED-NEXT: v_lshl_or_b32 v97, v93, 16, v75
+; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT: v_lshl_or_b32 v93, v96, 8, v80
+; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v75, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v75, 8, v98
+; ALIGNED-NEXT: v_lshl_or_b32 v80, v93, 16, v75
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v82, 8, v70
+; ALIGNED-NEXT: v_lshl_or_b32 v93, v87, 8, v84
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v70, v93, 16, v75
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:13
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v4, 8, v3
-; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:10
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v1, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:9
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_mov_b32_e32 v4, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: v_lshl_or_b32 v93, v84, 8, v82
+; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:9
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v122, v5, 8, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v80, 8, v70
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v80, v93, 16, v75
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:10
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v75, v84, 8, v96
+; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v121, v1, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v93, v87, 8, v80
+; ALIGNED-NEXT: v_mov_b32_e32 v84, v80
+; ALIGNED-NEXT: v_lshl_or_b32 v98, v93, 16, v75
; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:18
-; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16
-; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:17
; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232
-; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224
-; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload
-; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704
-; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:708
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v124
-; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v1, 8, v125
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
+; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v68, 3
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v69, vcc_lo
+; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:247
+; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:248
+; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:246
+; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:252
+; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:250
+; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:251
+; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:249
+; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:245
+; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:239
+; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:240
+; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:238
+; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:244
+; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:242
+; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:243
+; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:241
+; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:237
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:231
+; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:232
+; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:230
+; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:236
+; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:234
+; ALIGNED-NEXT: flat_store_byte v[3:4], v91 offset:235
+; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:233
+; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:229
+; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:223
+; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:224
+; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:222
+; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:228
+; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:226
+; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:227
+; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:225
+; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:221
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:206
+; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:208
+; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:207
+; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:210
+; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:212
+; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:211
+; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:209
+; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:215
+; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:216
+; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:214
+; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:220
+; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:218
+; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:219
+; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:217
+; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:213
+; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:205
+; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:199
+; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:200
+; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:198
+; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:204
+; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:202
+; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:203
+; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:201
+; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:197
+; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:191
+; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:192
+; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:190
+; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:196
+; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:194
+; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:195
+; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:193
+; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:189
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshl_or_b32 v98, v127, 8, v75
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_add_co_u32 v121, vcc_lo, v5, s4
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v122, null, s5, v6, vcc_lo
-; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
-; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v121, 3
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v122, vcc_lo
-; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:247
-; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:248
-; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:246
-; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:252
-; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:250
-; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:251
-; ALIGNED-NEXT: flat_store_byte v[5:6], v14 offset:249
-; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:245
-; ALIGNED-NEXT: flat_store_byte v[5:6], v17 offset:239
-; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:240
-; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:238
-; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:244
-; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:242
-; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:243
-; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:241
-; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:237
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:244
-; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:240
-; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:231
-; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:232
-; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:230
-; ALIGNED-NEXT: flat_store_byte v[5:6], v30 offset:236
-; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:234
-; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:235
-; ALIGNED-NEXT: flat_store_byte v[5:6], v33 offset:233
-; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:229
-; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:223
-; ALIGNED-NEXT: flat_store_byte v[5:6], v36 offset:224
-; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:222
-; ALIGNED-NEXT: flat_store_byte v[5:6], v34 offset:228
-; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:226
-; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:227
-; ALIGNED-NEXT: flat_store_byte v[5:6], v48 offset:225
-; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:221
-; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:192
-; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:204
-; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200
-; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:196
-; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:210
-; ALIGNED-NEXT: flat_store_byte v[5:6], v68 offset:212
-; ALIGNED-NEXT: flat_store_byte v[5:6], v50 offset:206
-; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:208
-; ALIGNED-NEXT: flat_store_byte v[5:6], v51 offset:207
-; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:211
-; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:209
-; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:215
-; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:216
-; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:214
-; ALIGNED-NEXT: flat_store_byte v[5:6], v55 offset:220
-; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:218
-; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:219
-; ALIGNED-NEXT: flat_store_byte v[5:6], v81 offset:217
-; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:213
-; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:205
-; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:216
-; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220
-; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:212
-; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:208
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:199
-; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:200
-; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:198
-; ALIGNED-NEXT: flat_store_byte v[5:6], v98 offset:204
-; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:202
-; ALIGNED-NEXT: flat_store_byte v[5:6], v100 offset:203
-; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:201
-; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:197
-; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:191
-; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:192
-; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:190
-; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:196
-; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:194
-; ALIGNED-NEXT: flat_store_byte v[5:6], v114 offset:195
-; ALIGNED-NEXT: flat_store_byte v[5:6], v117 offset:193
-; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:189
-; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
-; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800
+; ALIGNED-NEXT: v_lshl_or_b32 v127, v93, 8, v80
+; ALIGNED-NEXT: v_lshl_or_b32 v98, v98, 16, v127
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:183
-; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:184
-; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:182
-; ALIGNED-NEXT: flat_store_byte v[5:6], v45 offset:188
-; ALIGNED-NEXT: flat_store_byte v[5:6], v47 offset:186
-; ALIGNED-NEXT: flat_store_byte v[5:6], v46 offset:187
-; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:185
-; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:181
-; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:175
-; ALIGNED-NEXT: flat_store_byte v[5:6], v60 offset:176
-; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:174
-; ALIGNED-NEXT: flat_store_byte v[5:6], v58 offset:180
-; ALIGNED-NEXT: flat_store_byte v[5:6], v61 offset:178
-; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:179
-; ALIGNED-NEXT: flat_store_byte v[5:6], v72 offset:177
-; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:173
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:183
+; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:184
+; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:182
+; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:188
+; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:186
+; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:187
+; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:185
+; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:181
+; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:175
+; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:176
+; ALIGNED-NEXT: flat_store_byte v[3:4], v86 offset:174
+; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:180
+; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:178
+; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:179
+; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:177
+; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:173
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:167
-; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:168
-; ALIGNED-NEXT: flat_store_byte v[5:6], v75 offset:166
-; ALIGNED-NEXT: flat_store_byte v[5:6], v79 offset:172
-; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:170
-; ALIGNED-NEXT: flat_store_byte v[5:6], v88 offset:171
-; ALIGNED-NEXT: flat_store_byte v[5:6], v90 offset:169
-; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:165
-; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:159
-; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:160
-; ALIGNED-NEXT: flat_store_byte v[5:6], v106 offset:158
-; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:164
-; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:162
-; ALIGNED-NEXT: flat_store_byte v[5:6], v104 offset:163
-; ALIGNED-NEXT: flat_store_byte v[5:6], v107 offset:161
-; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:157
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v103 offset:167
+; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:168
+; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:166
+; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:172
+; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:170
+; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:171
+; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:169
+; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:165
+; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:159
+; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:160
+; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:158
+; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:164
+; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:162
+; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:163
+; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:161
+; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:157
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:151
-; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:152
-; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:150
-; ALIGNED-NEXT: flat_store_byte v[5:6], v126 offset:156
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:154
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:155
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:153
-; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:149
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:151
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:143
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:152
+; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:150
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:144
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:142
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:154
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:148
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:155
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:146
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:153
+; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:149
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:147
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:145
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:141
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:135
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:136
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:134
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:140
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:138
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:139
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:137
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:133
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:127
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:128
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:126
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:132
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:130
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:131
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:129
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:129
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:128
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[68:69], v0 offset:128
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:119
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:120
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:118
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:124
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:122
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:123
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:121
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:117
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:111
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:112
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:110
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:116
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:114
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:115
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:116
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:113
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:109
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:103
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:104
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:102
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:108
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:106
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:107
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:105
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:101
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:95
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:96
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:101
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:94
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:100
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:98
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:99
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:98
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:97
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:93
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:93
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:87
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:88
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:86
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:92
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:88
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:90
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:91
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:89
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:85
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:79
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:80
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:78
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:84
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:82
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:83
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:81
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:77
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:71
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:72
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:71
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:70
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:76
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:70
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:74
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:75
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:73
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:69
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:63
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:64
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:62
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:68
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:66
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:67
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:65
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:64
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[68:69], v0 offset:64
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:58
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:55
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:56
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:54
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:60
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:59
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:57
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:53
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:50
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:47
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:48
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:46
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:52
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:51
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:49
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:45
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:40
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:39
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:38
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:37
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:44
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:43
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:42
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:41
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:32
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:31
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:30
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:32
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:36
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:35
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:31
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:34
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:33
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[68:69], v0 offset:32
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:23
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:24
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:384
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:22
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:28
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:26
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:27
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:25
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:21
-; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:15
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:16
-; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:14
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:20
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:21
+; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:15
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:16
+; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:14
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:18
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:19
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:18
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:17
-; ALIGNED-NEXT: flat_store_byte v[121:122], v125 offset:16
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:17
+; ALIGNED-NEXT: flat_store_byte v[68:69], v80 offset:16
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[5:6], v3 offset:7
-; ALIGNED-NEXT: flat_store_byte v[5:6], v4 offset:8
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:10
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:6
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:7
+; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:8
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:12
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:10
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:11
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:9
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:8
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:12
+; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:11
+; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:9
+; ALIGNED-NEXT: flat_store_byte v[68:69], v96 offset:8
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[68:69], v0 offset:2
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[68:69], v0 offset:1
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:1
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[68:69], v0 offset:4
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[68:69], v0
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload
+; ALIGNED-NEXT: v_add_co_u32 v68, vcc_lo, 0x100, v68
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v69, null, 0, v69, vcc_lo
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:3
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[121:122], v0
-; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_1
-; ALIGNED-NEXT: .LBB9_2: ; %Flow10
-; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6
-; ALIGNED-NEXT: s_cbranch_execz .LBB9_5
-; ALIGNED-NEXT: ; %bb.3: ; %memmove_bwd_loop.preheader
-; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0x700, v2
-; ALIGNED-NEXT: s_movk_i32 s6, 0xff00
-; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700
-; ALIGNED-NEXT: s_mov_b32 s7, -1
-; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3
+; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_2
+; ALIGNED-NEXT: .LBB9_3: ; %Flow16
+; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
+; ALIGNED-NEXT: s_cbranch_execz .LBB9_6
+; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
+; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, 0x700, v68
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v69, vcc_lo
+; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
+; ALIGNED-NEXT: s_mov_b32 s5, -1
+; ALIGNED-NEXT: .LBB9_5: ; %memmove_bwd_loop
; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
; ALIGNED-NEXT: s_clause 0x3e
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:20
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:21
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:22
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:23
-; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:24
-; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:25
-; ALIGNED-NEXT: buffer_load_ubyte v127, v6, s[0:3], 0 offen offset:19
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:28
-; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:29
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:30
-; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:31
-; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:26
-; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:32
-; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:33
-; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:27
-; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:34
-; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:35
-; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:36
-; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:37
-; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:38
-; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:39
-; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:40
-; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:41
-; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:42
-; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:43
-; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:44
-; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:45
-; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:46
-; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:47
-; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:48
-; ALIGNED-NEXT: buffer_load_ubyte v31, v6, s[0:3], 0 offen offset:49
-; ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:50
-; ALIGNED-NEXT: buffer_load_ubyte v34, v6, s[0:3], 0 offen offset:51
-; ALIGNED-NEXT: buffer_load_ubyte v32, v6, s[0:3], 0 offen offset:52
-; ALIGNED-NEXT: buffer_load_ubyte v37, v6, s[0:3], 0 offen offset:53
-; ALIGNED-NEXT: buffer_load_ubyte v35, v6, s[0:3], 0 offen offset:54
-; ALIGNED-NEXT: buffer_load_ubyte v36, v6, s[0:3], 0 offen offset:55
-; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:56
-; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:57
-; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:60
-; ALIGNED-NEXT: buffer_load_ubyte v50, v6, s[0:3], 0 offen offset:61
-; ALIGNED-NEXT: buffer_load_ubyte v48, v6, s[0:3], 0 offen offset:62
-; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:63
-; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:58
-; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:64
-; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:65
-; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:59
-; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:66
-; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:67
-; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:68
-; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:69
-; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:70
-; ALIGNED-NEXT: buffer_load_ubyte v68, v6, s[0:3], 0 offen offset:71
-; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:76
-; ALIGNED-NEXT: buffer_load_ubyte v81, v6, s[0:3], 0 offen offset:75
-; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:77
-; ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:78
-; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:79
-; ALIGNED-NEXT: buffer_load_ubyte v123, v6, s[0:3], 0 offen offset:152
-; ALIGNED-NEXT: buffer_load_ubyte v122, v6, s[0:3], 0 offen offset:153
-; ALIGNED-NEXT: buffer_load_ubyte v111, v6, s[0:3], 0 offen offset:154
-; ALIGNED-NEXT: buffer_load_ubyte v108, v6, s[0:3], 0 offen offset:160
-; ALIGNED-NEXT: buffer_load_ubyte v105, v6, s[0:3], 0 offen offset:161
-; ALIGNED-NEXT: s_clause 0x34
-; ALIGNED-NEXT: buffer_load_ubyte v94, v6, s[0:3], 0 offen offset:162
-; ALIGNED-NEXT: buffer_load_ubyte v92, v6, s[0:3], 0 offen offset:163
-; ALIGNED-NEXT: buffer_load_ubyte v106, v6, s[0:3], 0 offen offset:164
-; ALIGNED-NEXT: buffer_load_ubyte v95, v6, s[0:3], 0 offen offset:165
-; ALIGNED-NEXT: buffer_load_ubyte v104, v6, s[0:3], 0 offen offset:166
-; ALIGNED-NEXT: buffer_load_ubyte v91, v6, s[0:3], 0 offen offset:167
-; ALIGNED-NEXT: buffer_load_ubyte v78, v6, s[0:3], 0 offen offset:172
-; ALIGNED-NEXT: buffer_load_ubyte v77, v6, s[0:3], 0 offen offset:173
-; ALIGNED-NEXT: buffer_load_ubyte v76, v6, s[0:3], 0 offen offset:174
-; ALIGNED-NEXT: buffer_load_ubyte v75, v6, s[0:3], 0 offen offset:175
-; ALIGNED-NEXT: buffer_load_ubyte v74, v6, s[0:3], 0 offen offset:171
-; ALIGNED-NEXT: buffer_load_ubyte v73, v6, s[0:3], 0 offen offset:168
-; ALIGNED-NEXT: buffer_load_ubyte v63, v6, s[0:3], 0 offen offset:169
-; ALIGNED-NEXT: buffer_load_ubyte v62, v6, s[0:3], 0 offen offset:170
-; ALIGNED-NEXT: buffer_load_ubyte v61, v6, s[0:3], 0 offen offset:176
-; ALIGNED-NEXT: buffer_load_ubyte v59, v6, s[0:3], 0 offen offset:177
-; ALIGNED-NEXT: buffer_load_ubyte v47, v6, s[0:3], 0 offen offset:178
-; ALIGNED-NEXT: buffer_load_ubyte v56, v6, s[0:3], 0 offen offset:179
-; ALIGNED-NEXT: buffer_load_ubyte v60, v6, s[0:3], 0 offen offset:180
-; ALIGNED-NEXT: buffer_load_ubyte v57, v6, s[0:3], 0 offen offset:181
-; ALIGNED-NEXT: buffer_load_ubyte v58, v6, s[0:3], 0 offen offset:182
-; ALIGNED-NEXT: buffer_load_ubyte v46, v6, s[0:3], 0 offen offset:183
-; ALIGNED-NEXT: buffer_load_ubyte v44, v6, s[0:3], 0 offen offset:188
-; ALIGNED-NEXT: buffer_load_ubyte v43, v6, s[0:3], 0 offen offset:189
-; ALIGNED-NEXT: buffer_load_ubyte v42, v6, s[0:3], 0 offen offset:190
-; ALIGNED-NEXT: buffer_load_ubyte v41, v6, s[0:3], 0 offen offset:191
-; ALIGNED-NEXT: buffer_load_ubyte v40, v6, s[0:3], 0 offen offset:187
-; ALIGNED-NEXT: buffer_load_ubyte v119, v6, s[0:3], 0 offen offset:184
-; ALIGNED-NEXT: buffer_load_ubyte v118, v6, s[0:3], 0 offen offset:185
-; ALIGNED-NEXT: buffer_load_ubyte v116, v6, s[0:3], 0 offen offset:186
-; ALIGNED-NEXT: buffer_load_ubyte v115, v6, s[0:3], 0 offen offset:192
-; ALIGNED-NEXT: buffer_load_ubyte v112, v6, s[0:3], 0 offen offset:193
-; ALIGNED-NEXT: buffer_load_ubyte v101, v6, s[0:3], 0 offen offset:194
-; ALIGNED-NEXT: buffer_load_ubyte v100, v6, s[0:3], 0 offen offset:195
-; ALIGNED-NEXT: buffer_load_ubyte v113, v6, s[0:3], 0 offen offset:196
-; ALIGNED-NEXT: buffer_load_ubyte v103, v6, s[0:3], 0 offen offset:197
-; ALIGNED-NEXT: buffer_load_ubyte v102, v6, s[0:3], 0 offen offset:198
-; ALIGNED-NEXT: buffer_load_ubyte v99, v6, s[0:3], 0 offen offset:199
-; ALIGNED-NEXT: buffer_load_ubyte v97, v6, s[0:3], 0 offen offset:204
-; ALIGNED-NEXT: buffer_load_ubyte v87, v6, s[0:3], 0 offen offset:205
-; ALIGNED-NEXT: buffer_load_ubyte v96, v6, s[0:3], 0 offen offset:206
-; ALIGNED-NEXT: buffer_load_ubyte v86, v6, s[0:3], 0 offen offset:207
-; ALIGNED-NEXT: buffer_load_ubyte v85, v6, s[0:3], 0 offen offset:203
-; ALIGNED-NEXT: buffer_load_ubyte v84, v6, s[0:3], 0 offen offset:200
-; ALIGNED-NEXT: buffer_load_ubyte v83, v6, s[0:3], 0 offen offset:201
-; ALIGNED-NEXT: buffer_load_ubyte v82, v6, s[0:3], 0 offen offset:202
-; ALIGNED-NEXT: buffer_load_ubyte v124, v6, s[0:3], 0 offen offset:155
-; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:2
-; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:3
-; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:4
-; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:5
-; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:6
-; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:7
-; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
-; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v4
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8
-; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v11, 8, v12
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14
-; ALIGNED-NEXT: v_lshl_or_b32 v8, v20, 8, v19
-; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13
-; ALIGNED-NEXT: v_lshl_or_b32 v10, v18, 8, v17
-; ALIGNED-NEXT: v_lshl_or_b32 v11, v26, 8, v22
-; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v27
-; ALIGNED-NEXT: v_lshl_or_b32 v13, v23, 8, v21
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v14, v25, 8, v24
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v4
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 16, v9
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 16, v11
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13
-; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v39
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v49, 8, v48
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v51, 8, v38
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v4, 16, v3
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:85
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v5
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:86
-; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:82
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:87
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1812
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1813
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1814
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1815
+; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:1811
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1820
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1821
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1822
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:1823
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1816
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:1817
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:1818
+; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:1819
+; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:1824
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1825
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:1826
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:1827
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:1828
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1829
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:1830
+; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:1831
+; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:1832
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:1833
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:1834
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:1835
+; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:1836
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:1837
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:1838
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:1839
+; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:1840
+; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:1841
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:1842
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:1843
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1844
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:1845
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:1846
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:1847
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:1852
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:1853
+; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:1854
+; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:1855
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:1848
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:1849
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:1850
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:1851
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:1856
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:1857
+; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:1858
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:1859
+; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:1860
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:1861
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:1862
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:1863
+; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:1868
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:1869
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:1870
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:1871
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:1867
+; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:1944
+; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:1945
+; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1946
+; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:1952
+; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:1953
+; ALIGNED-NEXT: s_clause 0x2b
+; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:1954
+; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:1955
+; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:1956
+; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:1957
+; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:1958
+; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:1959
+; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:1964
+; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:1965
+; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:1966
+; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:1967
+; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:1963
+; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:1960
+; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:1961
+; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:1962
+; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:2028
+; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:2029
+; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:2030
+; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:2031
+; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:2027
+; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:2024
+; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:2025
+; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:2026
+; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:2032
+; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:2033
+; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:2034
+; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:2035
+; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:2036
+; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:2037
+; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:2038
+; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:2039
+; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:2044
+; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:2045
+; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:2046
+; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:2047
+; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:2043
+; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:2040
+; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:2041
+; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:2042
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:1792
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:1795
+; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:1796
+; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:1797
+; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:1798
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:1799
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], 0
; ALIGNED-NEXT: s_waitcnt vmcnt(62)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:74
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71
-; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:73
-; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:212
-; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:213
-; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:214
-; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:215
-; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:211
-; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:72
-; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:216
-; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:217
-; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:218
-; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:219
-; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:220
-; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:221
-; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:222
-; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:223
-; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:208
-; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:209
-; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:210
-; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x7
-; ALIGNED-NEXT: buffer_load_ubyte v37, v6, s[0:3], 0 offen offset:224
-; ALIGNED-NEXT: buffer_load_ubyte v35, v6, s[0:3], 0 offen offset:225
-; ALIGNED-NEXT: buffer_load_ubyte v31, v6, s[0:3], 0 offen offset:226
-; ALIGNED-NEXT: buffer_load_ubyte v32, v6, s[0:3], 0 offen offset:227
-; ALIGNED-NEXT: buffer_load_ubyte v36, v6, s[0:3], 0 offen offset:228
-; ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:229
-; ALIGNED-NEXT: buffer_load_ubyte v34, v6, s[0:3], 0 offen offset:230
-; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:231
-; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x4
-; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:236
-; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:237
-; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:238
-; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:239
-; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:235
-; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:232
-; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:233
-; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:234
-; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v8, 8, v6
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v9
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v7
+; ALIGNED-NEXT: v_lshl_or_b32 v8, v12, 8, v13
+; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v6, 16, v5
+; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v15
+; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18
+; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v8, 16, v7
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v11, v17, 8, v14
+; ALIGNED-NEXT: v_lshl_or_b32 v12, v21, 8, v19
+; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v10, 16, v9
+; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v23
+; ALIGNED-NEXT: v_lshl_or_b32 v14, v29, 8, v26
+; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v12, 16, v11
; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0xc
-; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:240
-; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:241
-; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:242
-; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:243
-; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:244
-; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:245
-; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:246
-; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:247
-; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:252
-; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:253
-; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:254
-; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:255
-; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:251
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(57)
-; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v89, 8, v120
-; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v126, v6, s[0:3], 0 offen offset:15
-; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: v_lshl_or_b32 v15, v25, 8, v22
+; ALIGNED-NEXT: v_lshl_or_b32 v16, v28, 8, v27
+; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v14, 16, v13
+; ALIGNED-NEXT: v_lshl_or_b32 v17, v32, 8, v31
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v34
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v38, 8, v33
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v37, 8, v36
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v16, 16, v15
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v50, 8, v48
+; ALIGNED-NEXT: v_lshl_or_b32 v8, v52, 8, v49
+; ALIGNED-NEXT: v_lshl_or_b32 v9, v54, 8, v53
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v17
+; ALIGNED-NEXT: s_waitcnt vmcnt(60)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v30
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1865
+; ALIGNED-NEXT: s_waitcnt vmcnt(59)
+; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 16, v4
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67
; ALIGNED-NEXT: s_waitcnt vmcnt(57)
-; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(56)
-; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v5, v65, 8, v55
; ALIGNED-NEXT: s_waitcnt vmcnt(55)
-; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(54)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v7, 16, v6
+; ALIGNED-NEXT: v_lshl_or_b32 v6, v69, 8, v66
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1874
+; ALIGNED-NEXT: s_waitcnt vmcnt(54)
+; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(53)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(52)
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(51)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(50)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(49)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(48)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(42)
+; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v9, 16, v8
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1879
+; ALIGNED-NEXT: s_waitcnt vmcnt(53)
+; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v70
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v81, 8, v80
+; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v6, 16, v5
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1877
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1878
+; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:1980
+; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:1981
+; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:1982
+; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:1983
+; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:1979
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1866
+; ALIGNED-NEXT: s_waitcnt vmcnt(60)
+; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:84
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:81
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:80
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1864
+; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:1976
+; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:1977
+; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:1978
+; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:1984
+; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:1985
+; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:1986
+; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:1987
+; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:1988
+; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:1989
+; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:1990
+; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1991
+; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:1996
+; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:1997
+; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:1998
+; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:1999
+; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:1995
+; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:1992
+; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:1993
+; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:1994
+; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:2004
+; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:2005
+; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:2006
+; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:2007
+; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:2003
+; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:2008
+; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2009
+; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:2010
+; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:2011
+; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:2012
+; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:2013
+; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2014
+; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:2015
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(58)
+; ALIGNED-NEXT: v_lshl_or_b32 v112, v43, 8, v46
+; ALIGNED-NEXT: s_waitcnt vmcnt(54)
+; ALIGNED-NEXT: v_lshl_or_b32 v42, v117, 8, v116
+; ALIGNED-NEXT: s_waitcnt vmcnt(47)
+; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(44)
+; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(43)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(41)
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(40)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(39)
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(33)
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v82, 8, v4
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:1968
+; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:1969
+; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:1970
+; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:1971
+; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:1972
+; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:1973
+; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:1974
+; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:1975
+; ALIGNED-NEXT: s_waitcnt vmcnt(40)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1876
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1873
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1875
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1872
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
-; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:98
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:102
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:103
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v4, 8, v7
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1890
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v1
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:94
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:95
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1885
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1887
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1895
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:93
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:91
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:92
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1886
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1883
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1884
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:90
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:101
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:89
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:88
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1894
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1881
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1882
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1880
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1893
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1892
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:99
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:100
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:97
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1889
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1891
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1888
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
-; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:114
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:118
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:119
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v4, 8, v7
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1906
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v1
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:110
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:111
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1901
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1903
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1911
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:109
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:107
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:108
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1902
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1899
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1900
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:106
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:117
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:105
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:104
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1910
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1897
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1898
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1896
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1909
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1908
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:115
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:116
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:113
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1905
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1907
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1904
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
-; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:130
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:134
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:135
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v4, 8, v7
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1922
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v1
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:126
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:127
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1917
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1919
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1927
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:125
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:123
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:124
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1918
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1915
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1916
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:122
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:133
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:121
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:120
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1926
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1913
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1914
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1912
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1925
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1924
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:131
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:132
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:129
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1921
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1923
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1920
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
-; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:146
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
-; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:150
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:151
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v4, 8, v7
+; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:1938
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v1
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:142
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:143
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1933
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1935
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:1943
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:141
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:139
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:140
-; ALIGNED-NEXT: s_waitcnt vmcnt(6)
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1934
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1931
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1932
; ALIGNED-NEXT: s_waitcnt vmcnt(5)
-; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:138
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:149
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:137
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:136
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1942
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1929
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1930
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1928
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1941
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1940
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:147
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:148
-; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:145
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:1937
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1939
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1936
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v4, 8, v7
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7
-; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:159
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v12
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2
-; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:158
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v18
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:157
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v3, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v1
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 8, v6
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:1949
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:1951
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: s_clause 0x1
+; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1950
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:1947
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:1948
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v6, 8, v4
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v100, 8, v101
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v94
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v104
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v1, 8, v0
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v122, 8, v124
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v95
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:2000
+; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2001
+; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:2002
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 8, v93
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v63, 8, v61
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v120, 8, v89
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v59, 8, v72
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v106
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v76
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v44, 8, v104
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v118, 8, v40
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v78
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v62
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v115
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v97
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v83
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v70, 8, v69
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v82
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v80
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v66
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v64
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v116
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v51
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:248
-; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:249
-; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v38, 8, v49
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v34, 8, v35
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v112, 8, v115
-; ALIGNED-NEXT: v_lshl_or_b32 v107, v2, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v99, 8, v102
-; ALIGNED-NEXT: v_lshl_or_b32 v90, v2, 16, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen
-; ALIGNED-NEXT: v_lshl_or_b32 v79, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82
-; ALIGNED-NEXT: v_lshl_or_b32 v72, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v80
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v70
-; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v66, 8, v71
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53
-; ALIGNED-NEXT: v_lshl_or_b32 v117, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54
-; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v49
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v39
-; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31
-; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34
-; ALIGNED-NEXT: v_lshl_or_b32 v68, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28
-; ALIGNED-NEXT: v_lshl_or_b32 v50, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v23, 8, v24
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v22
-; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v19, 8, v21
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v16, 8, v15
-; ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v20
-; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v11, 8, v13
-; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4
+; ALIGNED-NEXT: s_clause 0x7
+; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:2016
+; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:2017
+; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:2018
+; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:2019
+; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2020
+; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:2021
+; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:2022
+; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:2023
+; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v39
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v33, 8, v36
+; ALIGNED-NEXT: v_lshl_or_b32 v56, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v31
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v28, 8, v30
+; ALIGNED-NEXT: v_lshl_or_b32 v47, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v26
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v27, 8, v24
+; ALIGNED-NEXT: v_lshl_or_b32 v100, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v19, 8, v22
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v20
+; ALIGNED-NEXT: v_lshl_or_b32 v67, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v16, 8, v21
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v11
+; ALIGNED-NEXT: v_lshl_or_b32 v50, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v17
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v15
+; ALIGNED-NEXT: v_lshl_or_b32 v48, v4, 16, v3
+; ALIGNED-NEXT: s_waitcnt vmcnt(9)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v8
+; ALIGNED-NEXT: s_waitcnt vmcnt(8)
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v13, 8, v6
+; ALIGNED-NEXT: v_lshl_or_b32 v32, v4, 16, v3
+; ALIGNED-NEXT: s_waitcnt vmcnt(6)
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v125, 8, v0
+; ALIGNED-NEXT: s_waitcnt vmcnt(4)
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v123, 8, v126
+; ALIGNED-NEXT: v_lshl_or_b32 v23, v4, 16, v3
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v7, 8, v8
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v9, 8, v1
-; ALIGNED-NEXT: v_lshl_or_b32 v5, v88, 16, v5
-; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:1
-; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v88, 8, v0
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v109, 8, v93
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v121, 8, v110
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:8
-; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:9
-; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:10
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v109, 8, v121
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v107, 8, v110
+; ALIGNED-NEXT: v_lshl_or_b32 v12, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v94, 8, v105
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v90, 8, v92
+; ALIGNED-NEXT: v_lshl_or_b32 v7, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v78, 8, v77
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v88, 8, v75
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v62, 8, v73
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v57, 8, v60
+; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v45, 8, v58
+; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 16, v4
+; ALIGNED-NEXT: v_lshl_or_b32 v112, v119, 8, v41
+; ALIGNED-NEXT: v_lshl_or_b32 v114, v42, 16, v112
+; ALIGNED-NEXT: v_lshl_or_b32 v112, v101, 8, v103
+; ALIGNED-NEXT: v_lshl_or_b32 v42, v113, 8, v98
+; ALIGNED-NEXT: v_lshl_or_b32 v99, v42, 16, v112
; ALIGNED-NEXT: s_clause 0x1
-; ALIGNED-NEXT: buffer_load_ubyte v125, v6, s[0:3], 0 offen offset:13
-; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:14
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:12
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:1793
+; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:1794
+; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v126, 8, v89
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v125, 8, v0
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
-; ALIGNED-NEXT: v_lshl_or_b32 v88, v110, 8, v121
-; ALIGNED-NEXT: v_lshl_or_b32 v89, v120, 8, v109
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88
-; ALIGNED-NEXT: s_clause 0x2
-; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:18
-; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:16
-; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:17
-; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0xffffff00, v6
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
-; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:488
-; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492
-; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484
-; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480
-; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload
-; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704
-; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708
-; ALIGNED-NEXT: s_waitcnt vmcnt(4)
-; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v88
+; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v112, v112, 8, v96
+; ALIGNED-NEXT: v_lshl_or_b32 v42, v79, 8, v42
+; ALIGNED-NEXT: v_lshl_or_b32 v96, v42, 16, v112
+; ALIGNED-NEXT: v_lshl_or_b32 v112, v91, 8, v86
+; ALIGNED-NEXT: v_lshl_or_b32 v42, v108, 8, v106
+; ALIGNED-NEXT: s_clause 0x4
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:1807
+; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:1803
+; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:1800
+; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:1801
+; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:1802
+; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:1806
+; ALIGNED-NEXT: v_lshl_or_b32 v86, v42, 16, v112
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:1805
+; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:1804
; ALIGNED-NEXT: s_waitcnt vmcnt(2)
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v89, 8, v93
+; ALIGNED-NEXT: v_lshl_or_b32 v42, v79, 8, v96
+; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
; ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v3, vcc_lo
-; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127
-; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, 3
-; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v3, vcc_lo
-; ALIGNED-NEXT: flat_store_byte v[4:5], v1 offset:247
-; ALIGNED-NEXT: flat_store_byte v[4:5], v9 offset:248
-; ALIGNED-NEXT: flat_store_byte v[4:5], v7 offset:246
-; ALIGNED-NEXT: flat_store_byte v[4:5], v10 offset:252
-; ALIGNED-NEXT: flat_store_byte v[4:5], v11 offset:250
-; ALIGNED-NEXT: flat_store_byte v[4:5], v12 offset:251
-; ALIGNED-NEXT: flat_store_byte v[4:5], v13 offset:249
-; ALIGNED-NEXT: flat_store_byte v[4:5], v8 offset:245
-; ALIGNED-NEXT: flat_store_byte v[4:5], v15 offset:239
-; ALIGNED-NEXT: flat_store_byte v[4:5], v16 offset:240
-; ALIGNED-NEXT: flat_store_byte v[4:5], v19 offset:238
-; ALIGNED-NEXT: flat_store_byte v[4:5], v14 offset:244
-; ALIGNED-NEXT: flat_store_byte v[4:5], v17 offset:242
-; ALIGNED-NEXT: flat_store_byte v[4:5], v18 offset:243
-; ALIGNED-NEXT: flat_store_byte v[4:5], v20 offset:241
-; ALIGNED-NEXT: flat_store_byte v[4:5], v21 offset:237
-; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:504
-; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:508
-; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:500
-; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:496
-; ALIGNED-NEXT: flat_store_byte v[4:5], v22 offset:231
-; ALIGNED-NEXT: flat_store_byte v[4:5], v25 offset:232
-; ALIGNED-NEXT: flat_store_byte v[4:5], v23 offset:230
-; ALIGNED-NEXT: flat_store_byte v[4:5], v26 offset:236
-; ALIGNED-NEXT: flat_store_byte v[4:5], v27 offset:234
-; ALIGNED-NEXT: flat_store_byte v[4:5], v28 offset:235
-; ALIGNED-NEXT: flat_store_byte v[4:5], v29 offset:233
-; ALIGNED-NEXT: flat_store_byte v[4:5], v24 offset:229
-; ALIGNED-NEXT: flat_store_byte v[4:5], v31 offset:223
-; ALIGNED-NEXT: flat_store_byte v[4:5], v32 offset:224
-; ALIGNED-NEXT: flat_store_byte v[4:5], v35 offset:222
-; ALIGNED-NEXT: flat_store_byte v[4:5], v30 offset:228
-; ALIGNED-NEXT: flat_store_byte v[4:5], v33 offset:226
-; ALIGNED-NEXT: flat_store_byte v[4:5], v34 offset:227
-; ALIGNED-NEXT: flat_store_byte v[4:5], v36 offset:225
-; ALIGNED-NEXT: flat_store_byte v[4:5], v37 offset:221
-; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:448
-; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:460
-; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:456
-; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:452
-; ALIGNED-NEXT: flat_store_byte v[4:5], v67 offset:210
-; ALIGNED-NEXT: flat_store_byte v[4:5], v64 offset:212
-; ALIGNED-NEXT: flat_store_byte v[4:5], v38 offset:206
-; ALIGNED-NEXT: flat_store_byte v[4:5], v65 offset:208
-; ALIGNED-NEXT: flat_store_byte v[4:5], v39 offset:207
-; ALIGNED-NEXT: flat_store_byte v[4:5], v70 offset:211
-; ALIGNED-NEXT: flat_store_byte v[4:5], v80 offset:209
-; ALIGNED-NEXT: flat_store_byte v[4:5], v53 offset:215
-; ALIGNED-NEXT: flat_store_byte v[4:5], v52 offset:216
-; ALIGNED-NEXT: flat_store_byte v[4:5], v66 offset:214
-; ALIGNED-NEXT: flat_store_byte v[4:5], v51 offset:220
-; ALIGNED-NEXT: flat_store_byte v[4:5], v55 offset:218
-; ALIGNED-NEXT: flat_store_byte v[4:5], v54 offset:219
-; ALIGNED-NEXT: flat_store_byte v[4:5], v69 offset:217
-; ALIGNED-NEXT: flat_store_byte v[4:5], v71 offset:213
-; ALIGNED-NEXT: flat_store_byte v[4:5], v49 offset:205
-; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:472
-; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:476
-; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468
-; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: v_lshl_or_b32 v112, v112, 8, v86
+; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_mov_b32_e32 v86, v79
+; ALIGNED-NEXT: v_lshl_or_b32 v96, v42, 16, v112
+; ALIGNED-NEXT: v_lshl_or_b32 v112, v106, 8, v111
+; ALIGNED-NEXT: v_lshl_or_b32 v42, v108, 8, v91
+; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT: v_lshl_or_b32 v96, v42, 16, v112
+; ALIGNED-NEXT: s_clause 0x2
+; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:1810
+; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:1808
+; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:1809
+; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2
+; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
+; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v84, 3
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v85, vcc_lo
+; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:247
+; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:248
+; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:246
+; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:252
+; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:250
+; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:251
+; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:249
+; ALIGNED-NEXT: flat_store_byte v[3:4], v103 offset:245
+; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:239
+; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:240
+; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:238
+; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:244
+; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:242
+; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:243
+; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:241
+; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:237
+; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:496
+; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:231
+; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:232
+; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:230
+; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:236
+; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:234
+; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:235
+; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:233
+; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:229
+; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:223
+; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:224
+; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:222
+; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:228
+; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:226
+; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:227
+; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:225
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:221
+; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:206
+; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:208
+; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:207
+; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:210
+; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:212
+; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:211
+; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:209
+; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:215
+; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:216
+; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:214
+; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:220
+; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:218
+; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:219
+; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:217
+; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:213
+; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:205
+; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:464
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v82 offset:199
-; ALIGNED-NEXT: flat_store_byte v[4:5], v85 offset:200
-; ALIGNED-NEXT: flat_store_byte v[4:5], v83 offset:198
-; ALIGNED-NEXT: flat_store_byte v[4:5], v86 offset:204
-; ALIGNED-NEXT: flat_store_byte v[4:5], v87 offset:202
-; ALIGNED-NEXT: flat_store_byte v[4:5], v96 offset:203
-; ALIGNED-NEXT: flat_store_byte v[4:5], v97 offset:201
-; ALIGNED-NEXT: flat_store_byte v[4:5], v84 offset:197
-; ALIGNED-NEXT: flat_store_byte v[4:5], v101 offset:191
-; ALIGNED-NEXT: flat_store_byte v[4:5], v100 offset:192
-; ALIGNED-NEXT: flat_store_byte v[4:5], v112 offset:190
-; ALIGNED-NEXT: flat_store_byte v[4:5], v99 offset:196
-; ALIGNED-NEXT: flat_store_byte v[4:5], v103 offset:194
-; ALIGNED-NEXT: flat_store_byte v[4:5], v102 offset:195
-; ALIGNED-NEXT: flat_store_byte v[4:5], v113 offset:193
-; ALIGNED-NEXT: flat_store_byte v[4:5], v115 offset:189
-; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00
-; ALIGNED-NEXT: s_addc_u32 s5, s5, -1
-; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
+; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:199
+; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:200
+; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:198
+; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:204
+; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:202
+; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:203
+; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:201
+; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:197
+; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:191
+; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:192
+; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:190
+; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:196
+; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:194
+; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:195
+; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:193
+; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:189
+; ALIGNED-NEXT: s_waitcnt vmcnt(3)
+; ALIGNED-NEXT: v_lshl_or_b32 v96, v127, 8, v112
+; ALIGNED-NEXT: s_waitcnt vmcnt(1)
+; ALIGNED-NEXT: v_lshl_or_b32 v127, v42, 8, v79
+; ALIGNED-NEXT: v_lshl_or_b32 v96, v96, 16, v127
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
@@ -15184,22 +15209,22 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v116 offset:183
-; ALIGNED-NEXT: flat_store_byte v[4:5], v40 offset:184
-; ALIGNED-NEXT: flat_store_byte v[4:5], v118 offset:182
-; ALIGNED-NEXT: flat_store_byte v[4:5], v41 offset:188
-; ALIGNED-NEXT: flat_store_byte v[4:5], v43 offset:186
-; ALIGNED-NEXT: flat_store_byte v[4:5], v42 offset:187
-; ALIGNED-NEXT: flat_store_byte v[4:5], v44 offset:185
-; ALIGNED-NEXT: flat_store_byte v[4:5], v119 offset:181
-; ALIGNED-NEXT: flat_store_byte v[4:5], v47 offset:175
-; ALIGNED-NEXT: flat_store_byte v[4:5], v56 offset:176
-; ALIGNED-NEXT: flat_store_byte v[4:5], v59 offset:174
-; ALIGNED-NEXT: flat_store_byte v[4:5], v46 offset:180
-; ALIGNED-NEXT: flat_store_byte v[4:5], v57 offset:178
-; ALIGNED-NEXT: flat_store_byte v[4:5], v58 offset:179
-; ALIGNED-NEXT: flat_store_byte v[4:5], v60 offset:177
-; ALIGNED-NEXT: flat_store_byte v[4:5], v61 offset:173
+; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:183
+; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:184
+; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:182
+; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:188
+; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:186
+; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:187
+; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:185
+; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:181
+; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:175
+; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:176
+; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:174
+; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:180
+; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:178
+; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:179
+; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:177
+; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:173
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
@@ -15211,597 +15236,603 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v62 offset:167
-; ALIGNED-NEXT: flat_store_byte v[4:5], v74 offset:168
-; ALIGNED-NEXT: flat_store_byte v[4:5], v63 offset:166
-; ALIGNED-NEXT: flat_store_byte v[4:5], v75 offset:172
-; ALIGNED-NEXT: flat_store_byte v[4:5], v77 offset:170
-; ALIGNED-NEXT: flat_store_byte v[4:5], v76 offset:171
-; ALIGNED-NEXT: flat_store_byte v[4:5], v78 offset:169
-; ALIGNED-NEXT: flat_store_byte v[4:5], v73 offset:165
-; ALIGNED-NEXT: flat_store_byte v[4:5], v94 offset:159
-; ALIGNED-NEXT: flat_store_byte v[4:5], v92 offset:160
-; ALIGNED-NEXT: flat_store_byte v[4:5], v105 offset:158
-; ALIGNED-NEXT: flat_store_byte v[4:5], v91 offset:164
-; ALIGNED-NEXT: flat_store_byte v[4:5], v95 offset:162
-; ALIGNED-NEXT: flat_store_byte v[4:5], v104 offset:163
-; ALIGNED-NEXT: flat_store_byte v[4:5], v106 offset:161
-; ALIGNED-NEXT: flat_store_byte v[4:5], v108 offset:157
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:167
+; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:168
+; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:166
+; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:172
+; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:170
+; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:171
+; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:169
+; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:165
+; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:159
+; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:160
+; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:158
+; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:164
+; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:162
+; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:163
+; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:161
+; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:157
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v111 offset:151
-; ALIGNED-NEXT: flat_store_byte v[4:5], v124 offset:152
-; ALIGNED-NEXT: flat_store_byte v[4:5], v122 offset:150
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:151
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:156
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:152
+; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:150
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:154
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:154
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:155
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:155
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:153
-; ALIGNED-NEXT: flat_store_byte v[4:5], v123 offset:149
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:153
+; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:149
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:143
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:144
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:142
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:148
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:146
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:147
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:145
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:141
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:135
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:136
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:134
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:140
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:138
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:139
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:137
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:133
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:127
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:128
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:126
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:132
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:130
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:131
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:129
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:128
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:129
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[84:85], v0 offset:128
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:119
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:120
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:118
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:124
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:122
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:123
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:121
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:117
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:111
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:112
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:110
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:116
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:114
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:115
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:116
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:113
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:109
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:103
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:104
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:102
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:108
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:106
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:107
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:105
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:101
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:95
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:101
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:96
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:94
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:100
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:98
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:99
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:97
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:98
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:93
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:93
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:87
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:88
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:86
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:92
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:88
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:90
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:91
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:89
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:85
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:79
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:80
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:78
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:84
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:82
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:83
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:81
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:77
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:71
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:72
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:70
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:71
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:76
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:74
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:70
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:75
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:73
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:69
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:63
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:64
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:62
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:68
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:66
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:67
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:65
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:64
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[84:85], v0 offset:64
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:58
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:55
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:56
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:54
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:60
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:59
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:57
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:53
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:50
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:47
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:48
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:46
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:52
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:51
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:49
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:45
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:40
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:39
+; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:38
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:37
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:44
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:43
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:42
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:41
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:32
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:31
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:30
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:32
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:36
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:35
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:31
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:34
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:33
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[84:85], v0 offset:32
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644
-; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:640
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:23
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:24
+; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:640
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:22
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:28
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:26
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:27
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:25
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:21
-; ALIGNED-NEXT: flat_store_byte v[4:5], v88 offset:15
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:16
-; ALIGNED-NEXT: flat_store_byte v[4:5], v89 offset:14
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:20
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:21
+; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:15
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:16
+; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:14
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:18
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:19
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:18
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:17
-; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:16
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:17
+; ALIGNED-NEXT: flat_store_byte v[84:85], v79 offset:16
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
-; ALIGNED-NEXT: flat_store_byte v[4:5], v109 offset:7
-; ALIGNED-NEXT: flat_store_byte v[4:5], v120 offset:8
-; ALIGNED-NEXT: flat_store_byte v[4:5], v125 offset:10
-; ALIGNED-NEXT: flat_store_byte v[4:5], v110 offset:6
-; ALIGNED-NEXT: flat_store_byte v[4:5], v126 offset:12
-; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:11
; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v91 offset:7
+; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:8
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:9
-; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:8
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:10
+; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:6
+; ALIGNED-NEXT: flat_store_byte v[3:4], v86 offset:12
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:11
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:9
+; ALIGNED-NEXT: flat_store_byte v[84:85], v111 offset:8
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[84:85], v0 offset:2
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[84:85], v0 offset:1
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:2
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[84:85], v0 offset:4
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:3
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[84:85], v0
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload
+; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, 0xffffff00, v84
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, -1, v85, vcc_lo
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:4
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
+; ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
; ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; ALIGNED-NEXT: flat_store_byte v[2:3], v0
-; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4
-; ALIGNED-NEXT: .LBB9_5: ; %Flow11
-; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3
+; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_5
+; ALIGNED-NEXT: .LBB9_6: ; %Flow17
+; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6
; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload
; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32
; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4
@@ -15858,42 +15889,44 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; UNROLL3: ; %bb.0: ; %entry
; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; UNROLL3-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; UNROLL3-NEXT: s_mov_b64 s[4:5], 0
-; UNROLL3-NEXT: s_mov_b32 s6, exec_lo
+; UNROLL3-NEXT: s_mov_b32 s4, exec_lo
; UNROLL3-NEXT: v_cndmask_b32_e32 v3, -1, v0, vcc_lo
; UNROLL3-NEXT: v_cmpx_ge_u32_e64 v2, v3
-; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s6
+; UNROLL3-NEXT: s_xor_b32 s6, exec_lo, s4
; UNROLL3-NEXT: s_cbranch_execz .LBB9_4
; UNROLL3-NEXT: ; %bb.1: ; %memmove_fwd_loop.preheader
-; UNROLL3-NEXT: v_mov_b32_e32 v3, v2
+; UNROLL3-NEXT: v_mov_b32_e32 v4, v1
+; UNROLL3-NEXT: v_mov_b32_e32 v3, v0
+; UNROLL3-NEXT: v_mov_b32_e32 v5, v2
+; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7e0
; UNROLL3-NEXT: s_inst_prefetch 0x1
; UNROLL3-NEXT: .p2align 6
; UNROLL3-NEXT: .LBB9_2: ; %memmove_fwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: s_clause 0xb
-; UNROLL3-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen
-; UNROLL3-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4
-; UNROLL3-NEXT: buffer_load_dword v6, v3, s[0:3], 0 offen offset:8
-; UNROLL3-NEXT: buffer_load_dword v7, v3, s[0:3], 0 offen offset:12
-; UNROLL3-NEXT: buffer_load_dword v8, v3, s[0:3], 0 offen offset:16
-; UNROLL3-NEXT: buffer_load_dword v9, v3, s[0:3], 0 offen offset:20
-; UNROLL3-NEXT: buffer_load_dword v10, v3, s[0:3], 0 offen offset:24
-; UNROLL3-NEXT: buffer_load_dword v11, v3, s[0:3], 0 offen offset:28
-; UNROLL3-NEXT: buffer_load_dword v12, v3, s[0:3], 0 offen offset:32
-; UNROLL3-NEXT: buffer_load_dword v13, v3, s[0:3], 0 offen offset:36
-; UNROLL3-NEXT: buffer_load_dword v14, v3, s[0:3], 0 offen offset:40
-; UNROLL3-NEXT: buffer_load_dword v15, v3, s[0:3], 0 offen offset:44
-; UNROLL3-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo
-; UNROLL3-NEXT: s_add_u32 s4, s4, 48
-; UNROLL3-NEXT: v_add_nc_u32_e32 v3, 48, v3
-; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT: buffer_load_dword v6, v5, s[0:3], 0 offen
+; UNROLL3-NEXT: buffer_load_dword v7, v5, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT: buffer_load_dword v8, v5, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT: buffer_load_dword v9, v5, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT: buffer_load_dword v10, v5, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT: buffer_load_dword v11, v5, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT: buffer_load_dword v12, v5, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT: buffer_load_dword v13, v5, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT: buffer_load_dword v14, v5, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT: buffer_load_dword v15, v5, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT: buffer_load_dword v16, v5, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT: buffer_load_dword v17, v5, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT: v_add_nc_u32_e32 v5, 48, v5
+; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0
+; UNROLL3-NEXT: s_addc_u32 s5, s5, -1
; UNROLL3-NEXT: s_waitcnt vmcnt(4)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[8:11] offset:16
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
+; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[10:13] offset:16
+; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[6:9]
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
-; UNROLL3-NEXT: flat_store_dwordx4 v[16:17], v[12:15] offset:32
-; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0x7e0
+; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[14:17] offset:32
+; UNROLL3-NEXT: v_add_co_u32 v3, vcc_lo, v3, 48
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo
+; UNROLL3-NEXT: s_cmp_lg_u64 s[4:5], 0
; UNROLL3-NEXT: s_cbranch_scc1 .LBB9_2
; UNROLL3-NEXT: ; %bb.3: ; %memmove_fwd_residual
; UNROLL3-NEXT: s_inst_prefetch 0x2
@@ -15912,9 +15945,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; UNROLL3-NEXT: ; implicit-def: $vgpr2
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2032
-; UNROLL3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; UNROLL3-NEXT: .LBB9_4: ; %Flow8
-; UNROLL3-NEXT: s_andn2_saveexec_b32 s8, s6
+; UNROLL3-NEXT: ; implicit-def: $vgpr0
+; UNROLL3-NEXT: .LBB9_4: ; %Flow14
+; UNROLL3-NEXT: s_andn2_saveexec_b32 s6, s6
; UNROLL3-NEXT: s_cbranch_execz .LBB9_7
; UNROLL3-NEXT: ; %bb.5: ; %memmove_bwd_residual
; UNROLL3-NEXT: s_clause 0x3
@@ -15922,51 +15955,51 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5
; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2036
; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2040
; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2044
-; UNROLL3-NEXT: s_movk_i32 s6, 0xffd0
-; UNROLL3-NEXT: s_mov_b64 s[4:5], 0x7b0
-; UNROLL3-NEXT: s_mov_b32 s7, -1
+; UNROLL3-NEXT: s_movk_i32 s4, 0xf820
+; UNROLL3-NEXT: s_mov_b32 s5, -1
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2032
; UNROLL3-NEXT: s_clause 0x3
-; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016
-; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:2020
-; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2024
-; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2028
-; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v2
+; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT: v_add_co_u32 v3, vcc_lo, 0x7b0, v0
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
-; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:2016
+; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[5:8] offset:2016
; UNROLL3-NEXT: s_inst_prefetch 0x1
; UNROLL3-NEXT: .p2align 6
; UNROLL3-NEXT: .LBB9_6: ; %memmove_bwd_loop
; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
; UNROLL3-NEXT: s_clause 0xb
-; UNROLL3-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; UNROLL3-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; UNROLL3-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; UNROLL3-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
-; UNROLL3-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:32
-; UNROLL3-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:36
-; UNROLL3-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:40
-; UNROLL3-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:44
-; UNROLL3-NEXT: v_add_co_u32 v15, vcc_lo, v0, s4
-; UNROLL3-NEXT: v_add_co_ci_u32_e64 v16, null, s5, v1, vcc_lo
+; UNROLL3-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:1968
+; UNROLL3-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:1972
+; UNROLL3-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:1976
+; UNROLL3-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:1980
+; UNROLL3-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:1984
+; UNROLL3-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:1988
+; UNROLL3-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:1992
+; UNROLL3-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:1996
+; UNROLL3-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:2000
+; UNROLL3-NEXT: buffer_load_dword v14, v2, s[0:3], 0 offen offset:2004
+; UNROLL3-NEXT: buffer_load_dword v15, v2, s[0:3], 0 offen offset:2008
+; UNROLL3-NEXT: buffer_load_dword v16, v2, s[0:3], 0 offen offset:2012
; UNROLL3-NEXT: v_subrev_nc_u32_e32 v2, 48, v2
-; UNROLL3-NEXT: s_add_u32 s4, s4, 0xffffffd0
-; UNROLL3-NEXT: s_addc_u32 s5, s5, -1
+; UNROLL3-NEXT: s_add_u32 s4, s4, 48
+; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
; UNROLL3-NEXT: s_waitcnt vmcnt(4)
-; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[7:10] offset:16
-; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[3:6]
+; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[9:12] offset:16
+; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[5:8]
; UNROLL3-NEXT: s_waitcnt vmcnt(0)
-; UNROLL3-NEXT: flat_store_dwordx4 v[15:16], v[11:14] offset:32
-; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], s[6:7]
+; UNROLL3-NEXT: flat_store_dwordx4 v[3:4], v[13:16] offset:32
+; UNROLL3-NEXT: v_add_co_u32 v3, vcc_lo, 0xffffffd0, v3
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v4, vcc_lo
+; UNROLL3-NEXT: s_cmp_eq_u64 s[4:5], 0
; UNROLL3-NEXT: s_cbranch_scc0 .LBB9_6
-; UNROLL3-NEXT: .LBB9_7: ; %Flow9
+; UNROLL3-NEXT: .LBB9_7: ; %Flow15
; UNROLL3-NEXT: s_inst_prefetch 0x2
-; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; UNROLL3-NEXT: s_or_b32 exec_lo, exec_lo, s6
; UNROLL3-NEXT: s_waitcnt lgkmcnt(0)
; UNROLL3-NEXT: s_setpc_b64 s[30:31]
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-pattern.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-pattern.ll
index b6f5443b1ab48..7094fdb42e21a 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-pattern.ll
@@ -12,40 +12,39 @@ define amdgpu_kernel void @ds_prefetch_pattern(ptr addrspace(3) %lds, ptr addrsp
; CHECK-NEXT: s_load_b32 s1, s[4:5], 0x0 nv
; CHECK-NEXT: s_load_b32 s0, s[4:5], 0x10 nv
; CHECK-NEXT: v_and_b32_e32 v12, 0x3ff, v0
-; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_lshl_add_u32 v1, v12, 8, s1
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; CHECK-NEXT: v_mov_b32_e32 v7, v4
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_lshl_add_u32 v13, v12, 8, s1
; CHECK-NEXT: s_mov_b32 s1, 0
-; CHECK-NEXT: ds_load_b128 v[4:7], v1
-; CHECK-NEXT: ds_load_b128 v[8:11], v1 offset:16
-; CHECK-NEXT: v_dual_add_nc_u32 v13, 32, v1 :: v_dual_mov_b32 v1, v0
-; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0
+; CHECK-NEXT: ds_load_b128 v[8:11], v13
+; CHECK-NEXT: ds_load_b128 v[0:3], v13 offset:16
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: .LBB0_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_barrier_signal -1
-; CHECK-NEXT: s_wait_dscnt 0x1
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[6:7]
-; CHECK-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
; CHECK-NEXT: s_add_co_i32 s1, s1, 1
+; CHECK-NEXT: s_wait_dscnt 0x1
+; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[10:11]
+; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[8:9]
+; CHECK-NEXT: v_lshl_add_u32 v14, s1, 5, v13
; CHECK-NEXT: s_cmp_lt_i32 s1, s0
; CHECK-NEXT: s_wait_dscnt 0x0
-; CHECK-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[10:11]
-; CHECK-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[8:9]
+; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
+; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[0:1]
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; CHECK-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[2:3]
-; CHECK-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1]
+; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[6:7]
+; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[4:5]
; CHECK-NEXT: s_barrier_wait -1
-; CHECK-NEXT: ds_load_b128 v[4:7], v13
-; CHECK-NEXT: ds_load_b128 v[8:11], v13 offset:16
-; CHECK-NEXT: v_add_nc_u32_e32 v13, 32, v13
+; CHECK-NEXT: ds_load_b128 v[8:11], v14
+; CHECK-NEXT: ds_load_b128 v[0:3], v14 offset:16
; CHECK-NEXT: s_cbranch_scc1 .LBB0_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
; CHECK-NEXT: s_wait_kmcnt 0x0
-; CHECK-NEXT: global_store_b128 v12, v[0:3], s[0:1] scale_offset
+; CHECK-NEXT: global_store_b128 v12, v[4:7], s[0:1] scale_offset
; CHECK-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll
index a0f4dcde7f6ab..fa7c094499e03 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-invalid-ptr-extend.ll
@@ -12,22 +12,20 @@ define amdgpu_kernel void @scaledregtest() local_unnamed_addr {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: loopexit:
-; CHECK-NEXT: [[SCEVGEP11_LCSSA:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SCEVGEP13_LCSSA:%.*]] = phi ptr [ [[SCEVGEP13:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: br label [[FOR_BODY_1:%.*]]
; CHECK: for.body.1:
-; CHECK-NEXT: [[LSR_IV5:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP6:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP11_LCSSA]], [[LOOPEXIT:%.*]] ]
-; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP13_LCSSA]], [[LOOPEXIT]] ]
+; CHECK-NEXT: [[LSR_IV5:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP6:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP11_LCSSA:%.*]], [[LOOPEXIT:%.*]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP13_LCSSA:%.*]], [[LOOPEXIT]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[LSR_IV5]], align 8
; CHECK-NEXT: store ptr [[TMP0]], ptr [[LSR_IV1]], align 8
; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 8
; CHECK-NEXT: [[SCEVGEP6]] = getelementptr i8, ptr addrspace(5) [[LSR_IV5]], i32 8
; CHECK-NEXT: br label [[FOR_BODY_1]]
; CHECK: for.body:
-; CHECK-NEXT: [[LSR_IV12:%.*]] = phi ptr [ [[SCEVGEP13]], [[FOR_BODY]] ], [ null, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[LSR_IV10:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11]], [[FOR_BODY]] ], [ null, [[ENTRY]] ]
-; CHECK-NEXT: [[SCEVGEP11]] = getelementptr i8, ptr addrspace(5) [[LSR_IV10]], i32 64
-; CHECK-NEXT: [[SCEVGEP13]] = getelementptr i8, ptr [[LSR_IV12]], i64 64
+; CHECK-NEXT: [[SCEVGEP11_LCSSA]] = phi ptr addrspace(5) [ [[SCEVGEP4:%.*]], [[FOR_BODY]] ], [ inttoptr (i32 64 to ptr addrspace(5)), [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[SCEVGEP13_LCSSA]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ inttoptr (i64 64 to ptr), [[ENTRY]] ]
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[SCEVGEP13_LCSSA]], i64 64
+; CHECK-NEXT: [[SCEVGEP4]] = getelementptr i8, ptr addrspace(5) [[SCEVGEP11_LCSSA]], i32 64
; CHECK-NEXT: br i1 false, label [[LOOPEXIT]], label [[FOR_BODY]]
;
entry:
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
index 78c2d99e830fa..2bd4d42b8ac02 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[TMP:%.*]] = phi ptr addrspace(3) [ undef, %[[BB]] ], [ [[TMP18:%.*]], %[[BB17:.*]] ]
-; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP]], i32 8
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr inbounds [[TMP0:%.*]], ptr addrspace(3) [[TMP]], i64 0, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(3) [[SCEVGEP1]], align 8
; CHECK-NEXT: br label %[[BB4:.*]]
; CHECK: [[BB4]]:
@@ -26,14 +26,14 @@ define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 0, [[TMP10]]
; CHECK-NEXT: br i1 [[TMP11]], label %[[BB12:.*]], label %[[BB17]]
; CHECK: [[BB12]]:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP]], i32 16
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr inbounds [[TMP0]], ptr addrspace(3) [[TMP]], i64 0, i32 2
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[SCEVGEP]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 0, [[TMP14]]
; CHECK-NEXT: br i1 [[TMP15]], label %[[BB16:.*]], label %[[BB17]]
; CHECK: [[BB16]]:
; CHECK-NEXT: unreachable
; CHECK: [[BB17]]:
-; CHECK-NEXT: [[TMP18]] = getelementptr inbounds [[TMP0:%.*]], ptr addrspace(3) [[TMP]], i64 2
+; CHECK-NEXT: [[TMP18]] = getelementptr inbounds [[TMP0]], ptr addrspace(3) [[TMP]], i64 2
; CHECK-NEXT: br label %[[BB1]]
;
bb:
>From 2ed9ba95552b1880132d90b827c7d3eeee982dcf Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Mon, 2 Mar 2026 08:57:42 -0600
Subject: [PATCH 2/2] fixes for clang-format
---
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 8 ++++----
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 4 ++--
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index d50dd5bbb0e0c..eb7911737acf8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -15,8 +15,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetTransformInfo.h"
-#include "AMDGPUTargetMachine.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/InlineCost.h"
@@ -1706,9 +1706,9 @@ GCNTTIImpl::getInstructionUniformity(const Value *V) const {
}
InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
- StackOffset BaseOffset,
- bool HasBaseReg, int64_t Scale,
- unsigned AddrSpace) const {
+ StackOffset BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ unsigned AddrSpace) const {
// AMDGPU has limited addressing modes. base+scale*index requires an extra
// ADD instruction, unlike architectures with rich addressing modes.
if (HasBaseReg && Scale != 0)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 24d4ec8d85d45..dc7d01533da02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -313,8 +313,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
InstructionUniformity getInstructionUniformity(const Value *V) const override;
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
- StackOffset BaseOffset,
- bool HasBaseReg, int64_t Scale,
+ StackOffset BaseOffset, bool HasBaseReg,
+ int64_t Scale,
unsigned AddrSpace) const override;
bool isLSRCostLess(const TTI::LSRCost &A,
More information about the llvm-commits
mailing list