[llvm] a4fd3db - [AMDGPU] Use wider loop lowering type for LowerMemIntrinsics (#112332)

via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 28 01:04:25 PDT 2024


Author: Fabian Ritter
Date: 2024-10-28T09:04:19+01:00
New Revision: a4fd3dba6e285734bc635b0651a30dfeffedeada

URL: https://github.com/llvm/llvm-project/commit/a4fd3dba6e285734bc635b0651a30dfeffedeada
DIFF: https://github.com/llvm/llvm-project/commit/a4fd3dba6e285734bc635b0651a30dfeffedeada.diff

LOG: [AMDGPU] Use wider loop lowering type for LowerMemIntrinsics (#112332)

When llvm.memcpy or llvm.memmove intrinsics are lowered as a loop in
LowerMemIntrinsics.cpp, the loop consists of a single load/store pair
per iteration. We can improve performance in some cases by emitting
multiple load/store pairs per iteration. This patch achieves that by
increasing the width of the loop lowering type in the GCN target and
letting legalization split the resulting too-wide access pairs into
multiple legal access pairs.

This change only affects lowered memcpys and memmoves with large (>=
1024 bytes) constant lengths. Smaller constant lengths are handled by
ISel directly. Non-constant lengths keep the previous lowering, since
unrolling would slow them down whenever the dynamic length is smaller
or only slightly larger than what an unrolled iteration copies.

The chosen default unroll factor is the result of microbenchmarks on
gfx1030. This change leads to speedups of 15-38% for global memory and
1.9-5.8x for scratch in these microbenchmarks.

Part of SWDEV-455845.

Added: 
    llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
    llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 8f9495d83cde2d..5160851f8c4424 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -75,6 +75,13 @@ static cl::opt<size_t> InlineMaxBB(
     cl::desc("Maximum number of BBs allowed in a function after inlining"
              " (compile time constraint)"));
 
+// This default unroll factor is based on microbenchmarks on gfx1030.
+static cl::opt<unsigned> MemcpyLoopUnroll(
+    "amdgpu-memcpy-loop-unroll",
+    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
+             "operations when lowering memcpy as a loop"),
+    cl::init(16), cl::Hidden);
+
 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                               unsigned Depth = 0) {
   const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -409,13 +416,8 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
   return 1024;
 }
 
-// FIXME: Really we would like to issue multiple 128-bit loads and stores per
-// iteration. Should we report a larger size and let it legalize?
-//
 // FIXME: Should we use narrower types for local/region, or account for when
 // unaligned access is legal?
-//
-// FIXME: This could use fine tuning and microbenchmarks.
 Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
     unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -442,9 +444,22 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
     return FixedVectorType::get(Type::getInt32Ty(Context), 2);
   }
 
-  // Global memory works best with 16-byte accesses. Private memory will also
-  // hit this, although they'll be decomposed.
-  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
+  // Global memory works best with 16-byte accesses.
+  // If the operation has a fixed known length that is large enough, it is
+  // worthwhile to return an even wider type and let legalization lower it into
+  // multiple accesses, effectively unrolling the memcpy loop. Private memory
+  // also hits this, although accesses may be decomposed.
+  //
+  // Don't unroll if Length is not a constant, since unrolling leads to worse
+  // performance for length values that are smaller or slightly larger than the
+  // total size of the type returned here. Mitigating that would require a more
+  // complex lowering for variable-length memcpy and memmove.
+  unsigned I32EltsInVector = 4;
+  if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length))
+    return FixedVectorType::get(Type::getInt32Ty(Context),
+                                MemcpyLoopUnroll * I32EltsInVector);
+
+  return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
 }
 
 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
@@ -452,7 +467,6 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
     unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
     Align SrcAlign, Align DestAlign,
     std::optional<uint32_t> AtomicCpySize) const {
-  assert(RemainingBytes < 16);
 
   if (AtomicCpySize)
     BaseT::getMemcpyLoopResidualLoweringType(
@@ -462,6 +476,12 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
   Align MinAlign = std::min(SrcAlign, DestAlign);
 
   if (MinAlign != Align(2)) {
+    Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+    while (RemainingBytes >= 16) {
+      OpsOut.push_back(I32x4Ty);
+      RemainingBytes -= 16;
+    }
+
     Type *I64Ty = Type::getInt64Ty(Context);
     while (RemainingBytes >= 8) {
       OpsOut.push_back(I64Ty);

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index 7f23434c9dfdd6..75d4d8816fb30d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
 
 declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
 
@@ -14,104 +14,176 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
 ; LOOP-NEXT:    v_mov_b32_e32 v4, s0
 ; LOOP-NEXT:  .LBB0_1: ; %load-store-loop
 ; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT:    s_waitcnt expcnt(2)
+; LOOP-NEXT:    v_add_i32_e32 v29, vcc, v2, v4
+; LOOP-NEXT:    v_addc_u32_e32 v30, vcc, v3, v5, vcc
+; LOOP-NEXT:    buffer_load_ubyte v24, v[29:30], s[0:3], 0 addr64
+; LOOP-NEXT:    buffer_load_ubyte v27, v[29:30], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_load_ubyte v34, v[29:30], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_load_ubyte v35, v[29:30], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_load_ubyte v36, v[29:30], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT:    buffer_load_ubyte v37, v[29:30], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT:    buffer_load_ubyte v38, v[29:30], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT:    buffer_load_ubyte v39, v[29:30], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT:    buffer_load_ubyte v6, v[29:30], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT:    buffer_load_ubyte v9, v[29:30], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT:    buffer_load_ubyte v10, v[29:30], s[0:3], 0 addr64 offset:10
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_add_i32_e32 v6, vcc, v2, v4
-; LOOP-NEXT:    v_addc_u32_e32 v7, vcc, v3, v5, vcc
-; LOOP-NEXT:    v_add_i32_e32 v8, vcc, v0, v4
-; LOOP-NEXT:    v_addc_u32_e32 v9, vcc, v1, v5, vcc
-; LOOP-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
-; LOOP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; LOOP-NEXT:    buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64
-; LOOP-NEXT:    buffer_load_ubyte v11, v[6:7], s[0:3], 0 addr64 offset:1
-; LOOP-NEXT:    buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT:    buffer_load_ubyte v13, v[6:7], s[0:3], 0 addr64 offset:3
-; LOOP-NEXT:    buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:4
-; LOOP-NEXT:    buffer_load_ubyte v15, v[6:7], s[0:3], 0 addr64 offset:5
-; LOOP-NEXT:    buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:6
-; LOOP-NEXT:    buffer_load_ubyte v17, v[6:7], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT:    buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:8
-; LOOP-NEXT:    buffer_load_ubyte v19, v[6:7], s[0:3], 0 addr64 offset:9
-; LOOP-NEXT:    buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:10
-; LOOP-NEXT:    buffer_load_ubyte v21, v[6:7], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT:    buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:12
-; LOOP-NEXT:    buffer_load_ubyte v23, v[6:7], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT:    buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:14
-; LOOP-NEXT:    buffer_load_ubyte v6, v[6:7], s[0:3], 0 addr64 offset:15
-; LOOP-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v4
+; LOOP-NEXT:    buffer_load_ubyte v11, v[29:30], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT:    buffer_load_ubyte v7, v[29:30], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT:    buffer_load_ubyte v13, v[29:30], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT:    buffer_load_ubyte v14, v[29:30], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT:    buffer_load_ubyte v15, v[29:30], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT:    buffer_load_ubyte v8, v[29:30], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT:    buffer_load_ubyte v17, v[29:30], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT:    buffer_load_ubyte v18, v[29:30], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT:    buffer_load_ubyte v19, v[29:30], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT:    buffer_load_ubyte v12, v[29:30], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT:    buffer_load_ubyte v21, v[29:30], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT:    buffer_load_ubyte v22, v[29:30], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT:    buffer_load_ubyte v23, v[29:30], s[0:3], 0 addr64 offset:23
+; LOOP-NEXT:    buffer_load_ubyte v16, v[29:30], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT:    buffer_load_ubyte v25, v[29:30], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT:    buffer_load_ubyte v26, v[29:30], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT:    buffer_load_ubyte v28, v[29:30], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT:    buffer_load_ubyte v20, v[29:30], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT:    buffer_load_ubyte v31, v[29:30], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT:    buffer_load_ubyte v32, v[29:30], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT:    buffer_load_ubyte v33, v[29:30], s[0:3], 0 addr64 offset:31
 ; LOOP-NEXT:    s_waitcnt vmcnt(14)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v7, 8, v11
+; LOOP-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
+; LOOP-NEXT:    v_or_b32_e32 v24, v27, v24
+; LOOP-NEXT:    v_lshlrev_b32_e32 v27, 24, v35
+; LOOP-NEXT:    v_lshlrev_b32_e32 v29, 16, v34
+; LOOP-NEXT:    v_or_b32_e32 v27, v27, v29
+; LOOP-NEXT:    v_lshlrev_b32_e32 v29, 8, v37
+; LOOP-NEXT:    v_lshlrev_b32_e32 v30, 24, v39
+; LOOP-NEXT:    v_lshlrev_b32_e32 v34, 16, v38
+; LOOP-NEXT:    v_or_b32_e32 v29, v29, v36
+; LOOP-NEXT:    v_or_b32_e32 v30, v30, v34
+; LOOP-NEXT:    v_add_i32_e32 v34, vcc, v0, v4
+; LOOP-NEXT:    v_addc_u32_e32 v35, vcc, v1, v5, vcc
+; LOOP-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
+; LOOP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; LOOP-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v4
+; LOOP-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; LOOP-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; LOOP-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; LOOP-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; LOOP-NEXT:    v_lshlrev_b32_e32 v15, 24, v15
+; LOOP-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; LOOP-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
 ; LOOP-NEXT:    s_waitcnt vmcnt(12)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v11, 24, v13
-; LOOP-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; LOOP-NEXT:    v_lshlrev_b32_e32 v19, 24, v19
+; LOOP-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; LOOP-NEXT:    s_waitcnt vmcnt(10)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v13, 8, v15
+; LOOP-NEXT:    v_lshlrev_b32_e32 v21, 8, v21
 ; LOOP-NEXT:    s_waitcnt vmcnt(8)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v15, 24, v17
-; LOOP-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; LOOP-NEXT:    v_lshlrev_b32_e32 v23, 24, v23
+; LOOP-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
 ; LOOP-NEXT:    s_waitcnt vmcnt(6)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v17, 8, v19
+; LOOP-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
 ; LOOP-NEXT:    s_waitcnt vmcnt(4)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v19, 24, v21
-; LOOP-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; LOOP-NEXT:    v_lshlrev_b32_e32 v28, 24, v28
+; LOOP-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
 ; LOOP-NEXT:    s_waitcnt vmcnt(2)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v21, 8, v23
+; LOOP-NEXT:    v_lshlrev_b32_e32 v31, 8, v31
 ; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; LOOP-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; LOOP-NEXT:    v_or_b32_e32 v7, v7, v10
-; LOOP-NEXT:    v_or_b32_e32 v10, v11, v12
-; LOOP-NEXT:    v_or_b32_e32 v11, v13, v14
-; LOOP-NEXT:    v_or_b32_e32 v12, v15, v16
-; LOOP-NEXT:    v_or_b32_e32 v13, v17, v18
-; LOOP-NEXT:    v_or_b32_e32 v14, v19, v20
-; LOOP-NEXT:    v_or_b32_e32 v15, v21, v22
-; LOOP-NEXT:    v_or_b32_e32 v6, v6, v23
+; LOOP-NEXT:    v_lshlrev_b32_e32 v33, 24, v33
+; LOOP-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; LOOP-NEXT:    v_or_b32_e32 v6, v9, v6
+; LOOP-NEXT:    v_or_b32_e32 v9, v11, v10
+; LOOP-NEXT:    v_or_b32_e32 v7, v13, v7
+; LOOP-NEXT:    v_or_b32_e32 v10, v15, v14
+; LOOP-NEXT:    v_or_b32_e32 v8, v17, v8
+; LOOP-NEXT:    v_or_b32_e32 v11, v19, v18
+; LOOP-NEXT:    v_or_b32_e32 v12, v21, v12
+; LOOP-NEXT:    v_or_b32_e32 v13, v23, v22
+; LOOP-NEXT:    v_or_b32_e32 v14, v25, v16
+; LOOP-NEXT:    v_or_b32_e32 v15, v28, v26
+; LOOP-NEXT:    v_or_b32_e32 v16, v31, v20
+; LOOP-NEXT:    v_or_b32_e32 v17, v33, v32
+; LOOP-NEXT:    v_or_b32_e32 v18, v27, v24
+; LOOP-NEXT:    v_or_b32_e32 v19, v30, v29
+; LOOP-NEXT:    v_or_b32_e32 v6, v9, v6
 ; LOOP-NEXT:    v_or_b32_e32 v7, v10, v7
-; LOOP-NEXT:    v_or_b32_e32 v10, v12, v11
-; LOOP-NEXT:    v_or_b32_e32 v11, v14, v13
-; LOOP-NEXT:    v_or_b32_e32 v6, v6, v15
-; LOOP-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
-; LOOP-NEXT:    v_bfe_u32 v13, v7, 8, 8
-; LOOP-NEXT:    buffer_store_byte v7, v[8:9], s[0:3], 0 addr64
+; LOOP-NEXT:    v_or_b32_e32 v8, v11, v8
+; LOOP-NEXT:    v_or_b32_e32 v9, v13, v12
+; LOOP-NEXT:    v_or_b32_e32 v10, v15, v14
+; LOOP-NEXT:    v_or_b32_e32 v11, v17, v16
+; LOOP-NEXT:    v_lshrrev_b32_e32 v12, 16, v18
+; LOOP-NEXT:    v_bfe_u32 v13, v18, 8, 8
+; LOOP-NEXT:    buffer_store_byte v18, v[34:35], s[0:3], 0 addr64
+; LOOP-NEXT:    v_lshrrev_b32_e32 v14, 24, v18
+; LOOP-NEXT:    v_lshrrev_b32_e32 v15, 16, v19
+; LOOP-NEXT:    v_bfe_u32 v16, v19, 8, 8
+; LOOP-NEXT:    buffer_store_byte v19, v[34:35], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT:    v_lshrrev_b32_e32 v17, 24, v19
+; LOOP-NEXT:    s_waitcnt expcnt(1)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_bfe_u32 v19, v6, 8, 8
+; LOOP-NEXT:    buffer_store_byte v6, v[34:35], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v6, 24, v6
+; LOOP-NEXT:    v_lshrrev_b32_e32 v20, 16, v7
+; LOOP-NEXT:    v_bfe_u32 v21, v7, 8, 8
+; LOOP-NEXT:    buffer_store_byte v7, v[34:35], s[0:3], 0 addr64 offset:12
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v7, 24, v7
-; LOOP-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
-; LOOP-NEXT:    v_bfe_u32 v15, v10, 8, 8
-; LOOP-NEXT:    buffer_store_byte v10, v[8:9], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT:    v_lshrrev_b32_e32 v22, 16, v8
+; LOOP-NEXT:    v_bfe_u32 v23, v8, 8, 8
+; LOOP-NEXT:    buffer_store_byte v8, v[34:35], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v8, 24, v8
+; LOOP-NEXT:    v_lshrrev_b32_e32 v24, 16, v9
+; LOOP-NEXT:    v_bfe_u32 v25, v9, 8, 8
+; LOOP-NEXT:    buffer_store_byte v9, v[34:35], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT:    s_waitcnt expcnt(0)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v9, 24, v9
+; LOOP-NEXT:    v_lshrrev_b32_e32 v26, 16, v10
+; LOOP-NEXT:    v_bfe_u32 v27, v10, 8, 8
+; LOOP-NEXT:    buffer_store_byte v10, v[34:35], s[0:3], 0 addr64 offset:24
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v10, 24, v10
-; LOOP-NEXT:    v_lshrrev_b32_e32 v16, 16, v11
-; LOOP-NEXT:    v_bfe_u32 v17, v11, 8, 8
-; LOOP-NEXT:    buffer_store_byte v11, v[8:9], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT:    v_lshrrev_b32_e32 v28, 16, v11
+; LOOP-NEXT:    v_bfe_u32 v29, v11, 8, 8
+; LOOP-NEXT:    buffer_store_byte v11, v[34:35], s[0:3], 0 addr64 offset:28
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v11, 24, v11
-; LOOP-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
-; LOOP-NEXT:    v_bfe_u32 v19, v6, 8, 8
-; LOOP-NEXT:    buffer_store_byte v6, v[8:9], s[0:3], 0 addr64 offset:12
-; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v6, 24, v6
-; LOOP-NEXT:    buffer_store_byte v13, v[8:9], s[0:3], 0 addr64 offset:1
-; LOOP-NEXT:    buffer_store_byte v12, v[8:9], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT:    buffer_store_byte v7, v[8:9], s[0:3], 0 addr64 offset:3
-; LOOP-NEXT:    buffer_store_byte v15, v[8:9], s[0:3], 0 addr64 offset:5
-; LOOP-NEXT:    buffer_store_byte v14, v[8:9], s[0:3], 0 addr64 offset:6
-; LOOP-NEXT:    buffer_store_byte v10, v[8:9], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT:    buffer_store_byte v17, v[8:9], s[0:3], 0 addr64 offset:9
-; LOOP-NEXT:    buffer_store_byte v16, v[8:9], s[0:3], 0 addr64 offset:10
-; LOOP-NEXT:    buffer_store_byte v11, v[8:9], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT:    buffer_store_byte v19, v[8:9], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT:    buffer_store_byte v18, v[8:9], s[0:3], 0 addr64 offset:14
-; LOOP-NEXT:    buffer_store_byte v6, v[8:9], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT:    buffer_store_byte v13, v[34:35], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_store_byte v12, v[34:35], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_store_byte v14, v[34:35], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_store_byte v16, v[34:35], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT:    buffer_store_byte v15, v[34:35], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT:    buffer_store_byte v17, v[34:35], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT:    buffer_store_byte v19, v[34:35], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT:    buffer_store_byte v18, v[34:35], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT:    buffer_store_byte v6, v[34:35], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT:    buffer_store_byte v21, v[34:35], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT:    buffer_store_byte v20, v[34:35], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT:    buffer_store_byte v7, v[34:35], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT:    buffer_store_byte v23, v[34:35], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT:    buffer_store_byte v22, v[34:35], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT:    buffer_store_byte v8, v[34:35], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT:    buffer_store_byte v25, v[34:35], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT:    buffer_store_byte v24, v[34:35], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT:    buffer_store_byte v9, v[34:35], s[0:3], 0 addr64 offset:23
+; LOOP-NEXT:    buffer_store_byte v27, v[34:35], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT:    buffer_store_byte v26, v[34:35], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT:    buffer_store_byte v10, v[34:35], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT:    buffer_store_byte v29, v[34:35], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT:    buffer_store_byte v28, v[34:35], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT:    buffer_store_byte v11, v[34:35], s[0:3], 0 addr64 offset:31
 ; LOOP-NEXT:    s_cbranch_vccnz .LBB0_1
 ; LOOP-NEXT:  ; %bb.2: ; %memcpy-split
 ; LOOP-NEXT:    s_mov_b32 s2, 0
 ; LOOP-NEXT:    s_mov_b32 s3, 0xf000
 ; LOOP-NEXT:    s_mov_b64 s[0:1], 0
-; LOOP-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:17
-; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:19
-; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:18
-; LOOP-NEXT:    buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:33
+; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:35
+; LOOP-NEXT:    buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:34
+; LOOP-NEXT:    buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:32
 ; LOOP-NEXT:    s_waitcnt vmcnt(3)
 ; LOOP-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
 ; LOOP-NEXT:    s_waitcnt vmcnt(2)
@@ -124,12 +196,12 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
 ; LOOP-NEXT:    v_or_b32_e32 v2, v3, v2
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; LOOP-NEXT:    v_bfe_u32 v4, v2, 8, 8
-; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:32
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:17
-; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:18
-; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:33
+; LOOP-NEXT:    buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:34
+; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:35
 ; LOOP-NEXT:    s_endpgm
 ;
 ; UNROLL-LABEL: memcpy_p1i8:
@@ -212,11 +284,75 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
 ; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:18
 ; UNROLL-NEXT:    s_waitcnt vmcnt(0)
 ; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:18
-; UNROLL-NEXT:    buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:19
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:19
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:19
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:20
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:20
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:21
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:21
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:22
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:22
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:23
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:23
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:24
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:24
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:25
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:25
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:26
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:26
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:27
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:27
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:28
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:28
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:29
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:29
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:30
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:30
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:31
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:31
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:32
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:32
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:33
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:33
+; UNROLL-NEXT:    s_waitcnt expcnt(0)
+; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:34
+; UNROLL-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:34
+; UNROLL-NEXT:    buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:35
 ; UNROLL-NEXT:    s_waitcnt vmcnt(0)
-; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19
+; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:35
 ; UNROLL-NEXT:    s_endpgm
-  call void @llvm.memcpy.p1.p1.i32(ptr addrspace(1) %dst, ptr addrspace(1) %src, i32 20, i1 false)
+  call void @llvm.memcpy.p1.p1.i32(ptr addrspace(1) %dst, ptr addrspace(1) %src, i32 36, i1 false)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index a95f22507eece3..ffe9e06c04ae45 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -46,10 +46,10 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(ptr addrspace(1)
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1
+; ALL-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; ALL-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
@@ -66,10 +66,10 @@ define amdgpu_kernel void @min_size_large_static_memcpy_caller0(ptr addrspace(1)
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -93,20 +93,20 @@ define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
 ; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 16
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 256
 ; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP2]], align 1
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
 ; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
 ; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1
 ; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 16
+; ALL-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 256
 ; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1024
 ; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
@@ -128,20 +128,20 @@ define amdgpu_kernel void @min_size_large_static_memmove_caller0(ptr addrspace(1
 ; OPT-NEXT:    br label [[MEMMOVE_BWD_LOOP:%.*]]
 ; OPT:       memmove_bwd_loop:
 ; OPT-NEXT:    [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[MEMMOVE_BWD_RESIDUAL]] ]
-; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP4]], 16
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP4]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
-; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; OPT-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1
 ; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1
 ; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; OPT:       memmove_fwd_loop:
 ; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP10:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
 ; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
-; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP8]], align 1
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP8]], align 1
 ; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1
-; OPT-NEXT:    [[TMP10]] = add i64 [[FWD_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1
+; OPT-NEXT:    [[TMP10]] = add i64 [[FWD_INDEX]], 256
 ; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 1024
 ; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
 ; OPT:       memmove_fwd_residual:
@@ -421,17 +421,30 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac
 ; ALL-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
 ; ALL-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
 ; ALL:       post-loop-memcpy-expansion:
-; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
-; ALL:       load-store-loop:
-; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION]] ], [ [[TMP19:%.*]], [[LOAD_STORE_LOOP]] ]
-; ALL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 0
 ; ALL-NEXT:    [[TMP17:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP16]], align 1
-; ALL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 0
 ; ALL-NEXT:    store <4 x i32> [[TMP17]], ptr addrspace(1) [[TMP18]], align 1
-; ALL-NEXT:    [[TMP19]] = add i64 [[LOOP_INDEX]], 16
-; ALL-NEXT:    [[TMP20:%.*]] = icmp ult i64 [[TMP19]], 96
-; ALL-NEXT:    br i1 [[TMP20]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
-; ALL:       memcpy-split:
+; ALL-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 16
+; ALL-NEXT:    [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP33]], align 1
+; ALL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 16
+; ALL-NEXT:    store <4 x i32> [[TMP19]], ptr addrspace(1) [[TMP20]], align 1
+; ALL-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 32
+; ALL-NEXT:    [[TMP35:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP34]], align 1
+; ALL-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 32
+; ALL-NEXT:    store <4 x i32> [[TMP35]], ptr addrspace(1) [[TMP36]], align 1
+; ALL-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 48
+; ALL-NEXT:    [[TMP38:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP37]], align 1
+; ALL-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 48
+; ALL-NEXT:    store <4 x i32> [[TMP38]], ptr addrspace(1) [[TMP39]], align 1
+; ALL-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 64
+; ALL-NEXT:    [[TMP28:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP40]], align 1
+; ALL-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 64
+; ALL-NEXT:    store <4 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 1
+; ALL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 80
+; ALL-NEXT:    [[TMP31:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP30]], align 1
+; ALL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 80
+; ALL-NEXT:    store <4 x i32> [[TMP31]], ptr addrspace(1) [[TMP32]], align 1
 ; ALL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 96
 ; ALL-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 1
 ; ALL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 96
@@ -456,10 +469,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -479,10 +492,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -502,10 +515,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -525,10 +538,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -548,10 +561,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -575,10 +588,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -606,10 +619,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -633,10 +646,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -691,10 +704,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -764,10 +777,10 @@ define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspa
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
+; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -814,10 +827,10 @@ define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(ptr addrspa
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1
-; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1
+; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -864,10 +877,10 @@ define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(ptr addrspa
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1
+; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
+; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 256
 ; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
@@ -1194,17 +1207,10 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_16(ptr addrspace(1
 ; MAX1024-NEXT:    ret void
 ;
 ; ALL-LABEL: @memcpy_global_align4_global_align4_16(
-; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
-; ALL:       load-store-loop:
-; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
 ; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
 ; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
-; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
-; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
-; ALL:       memcpy-split:
 ; ALL-NEXT:    ret void
 ;
   call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 16, i1 false)
@@ -1326,20 +1332,20 @@ define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrs
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
 ; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 16
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 256
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP3]], align 1
 ; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
 ; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP6]], align 1
 ; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 16
+; ALL-NEXT:    store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 256
 ; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
 ; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
@@ -1360,20 +1366,20 @@ define amdgpu_kernel void @memmove_global_align1_flat_align1(ptr addrspace(1) %d
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
 ; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 16
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 256
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1
 ; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP4]], align 1
+; ALL-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP4]], align 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
 ; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1
 ; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP7]], align 1
-; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 16
+; ALL-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 256
 ; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
 ; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
@@ -1394,20 +1400,20 @@ define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addr
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
 ; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 16
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 256
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP3]], align 1
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP3]], align 1
 ; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT:    store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
 ; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP6]], align 1
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP6]], align 1
 ; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 16
+; ALL-NEXT:    store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 256
 ; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
 ; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
@@ -1428,20 +1434,20 @@ define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) %
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
 ; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 16
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 256
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1
 ; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP4]], align 1
+; ALL-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP4]], align 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
 ; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1
 ; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP7]], align 1
-; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 16
+; ALL-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP7]], align 1
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 256
 ; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
 ; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
@@ -1461,10 +1467,10 @@ define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5)
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]]
+; ALL-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]]
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META0]]
-; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; ALL-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META0]]
+; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
@@ -1484,10 +1490,10 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1)
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]]
+; ALL-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]]
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias [[META3]]
-; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; ALL-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias [[META3]]
+; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
@@ -2144,20 +2150,20 @@ define amdgpu_kernel void @memmove_private_align1_private_align1(ptr addrspace(5
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
 ; ALL-NEXT:    [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP1]], 16
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP1]], 256
 ; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP2]], align 1
+; ALL-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP2]], align 1
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP3]], align 1
+; ALL-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP3]], align 1
 ; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
 ; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP5]], align 1
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP5]], align 1
 ; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7]] = add i32 [[FWD_INDEX]], 16
+; ALL-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7]] = add i32 [[FWD_INDEX]], 256
 ; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256
 ; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
@@ -2231,27 +2237,27 @@ define amdgpu_kernel void @memmove_global_align4_static_residual_empty(ptr addrs
 ; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
 ; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; OPT:       memmove_bwd_loop:
-; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1040, [[TMP0:%.*]] ]
-; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 16
+; OPT-NEXT:    [[TMP11:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1280, [[TMP0:%.*]] ]
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP11]], 256
 ; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
-; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; OPT-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP2]], align 1
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; OPT-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; OPT:       memmove_fwd_loop:
 ; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
 ; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
-; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1
 ; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
-; OPT-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 16
-; OPT-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1040
+; OPT-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; OPT-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 256
+; OPT-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1280
 ; OPT-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; OPT:       memmove_done:
 ; OPT-NEXT:    ret void
 ;
-  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1040, i1 false)
+  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1280, i1 false)
   ret void
 }
 
@@ -2279,20 +2285,20 @@ define amdgpu_kernel void @memmove_global_align4_static_residual_full(ptr addrsp
 ; OPT-NEXT:    br label [[MEMMOVE_BWD_LOOP:%.*]]
 ; OPT:       memmove_bwd_loop:
 ; OPT-NEXT:    [[TMP13:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[MEMMOVE_BWD_RESIDUAL]] ]
-; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP13]], 16
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP13]], 256
 ; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
-; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP14]], align 1
+; OPT-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP14]], align 1
 ; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP15]], align 1
+; OPT-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP15]], align 1
 ; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; OPT:       memmove_fwd_loop:
 ; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
 ; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
-; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP17]], align 1
+; OPT-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP17]], align 1
 ; OPT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
-; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP18]], align 1
-; OPT-NEXT:    [[TMP19]] = add i64 [[FWD_INDEX]], 16
+; OPT-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP18]], align 1
+; OPT-NEXT:    [[TMP19]] = add i64 [[FWD_INDEX]], 256
 ; OPT-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 1024
 ; OPT-NEXT:    br i1 [[TMP20]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
 ; OPT:       memmove_fwd_residual:
@@ -2363,40 +2369,40 @@ entry:
 
 define amdgpu_kernel void @memmove_volatile(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
 ; MAX1024-LABEL: @memmove_volatile(
-; MAX1024-NEXT:    call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 64, i1 true)
+; MAX1024-NEXT:    call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 512, i1 true)
 ; MAX1024-NEXT:    ret void
 ;
 ; ALL-LABEL: @memmove_volatile(
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
-; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 16
+; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 512, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 256
 ; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT:    [[ELEMENT:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP2]], align 1
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
-; ALL-NEXT:    store volatile <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT:    store volatile <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
 ; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
 ; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    [[ELEMENT1:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
+; ALL-NEXT:    [[ELEMENT1:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP5]], align 1
 ; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
-; ALL-NEXT:    store volatile <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 16
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 64
+; ALL-NEXT:    store volatile <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
+; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 256
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 512
 ; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
 ;
-  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 64, i1 true)
+  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 512, i1 true)
   ret void
 }
 
 define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
 ; MAX1024-LABEL: @memcpy_volatile(
-; MAX1024-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 64, i1 true)
+; MAX1024-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 512, i1 true)
 ; MAX1024-NEXT:    ret void
 ;
 ; ALL-LABEL: @memcpy_volatile(
@@ -2404,16 +2410,16 @@ define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace(
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    [[TMP2:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP1]], align 1
+; ALL-NEXT:    [[TMP2:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP1]], align 1
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    store volatile <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
-; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; ALL-NEXT:    store volatile <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
+; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 512
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
 ; ALL-NEXT:    ret void
 ;
-  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 64, i1 true)
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 512, i1 true)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
new file mode 100644
index 00000000000000..565fce0e7abdea
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -0,0 +1,16049 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -mattr=-unaligned-access-mode %s -o - | FileCheck -check-prefix=ALIGNED %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-memcpy-loop-unroll=3 %s -o - | FileCheck -check-prefix=UNROLL3 %s
+
+; For checking that LowerMemIntrinsics lowers memcpy and memmove with large
+; constant copy-sizes into loops with multiple load/store pairs.
+
+
+; memcpy for address spaces 0, 1, 4, 5
+
+define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz2048:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:  .LBB0_1: ; %load-store-loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_co_u32 v96, vcc_lo, v2, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
+; CHECK-NEXT:    s_clause 0xf
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[96:97] offset:224
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[96:97] offset:240
+; CHECK-NEXT:    flat_load_dwordx4 v[12:15], v[96:97] offset:192
+; CHECK-NEXT:    flat_load_dwordx4 v[16:19], v[96:97] offset:208
+; CHECK-NEXT:    flat_load_dwordx4 v[20:23], v[96:97] offset:160
+; CHECK-NEXT:    flat_load_dwordx4 v[24:27], v[96:97] offset:176
+; CHECK-NEXT:    flat_load_dwordx4 v[28:31], v[96:97] offset:128
+; CHECK-NEXT:    flat_load_dwordx4 v[32:35], v[96:97] offset:144
+; CHECK-NEXT:    flat_load_dwordx4 v[36:39], v[96:97] offset:96
+; CHECK-NEXT:    flat_load_dwordx4 v[48:51], v[96:97] offset:112
+; CHECK-NEXT:    flat_load_dwordx4 v[52:55], v[96:97] offset:64
+; CHECK-NEXT:    flat_load_dwordx4 v[64:67], v[96:97] offset:80
+; CHECK-NEXT:    flat_load_dwordx4 v[68:71], v[96:97] offset:32
+; CHECK-NEXT:    flat_load_dwordx4 v[80:83], v[96:97] offset:48
+; CHECK-NEXT:    flat_load_dwordx4 v[84:87], v[96:97]
+; CHECK-NEXT:    flat_load_dwordx4 v[96:99], v[96:97] offset:16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[4:7] offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[8:11] offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[12:15] offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[16:19] offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[20:23] offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[24:27] offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[28:31] offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[32:35] offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[36:39] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99] offset:16
+; CHECK-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; CHECK-NEXT:    s_cbranch_vccnz .LBB0_1
+; CHECK-NEXT:  ; %bb.2: ; %memcpy-split
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memcpy_p0_p0_sz2048:
+; ALIGNED:       ; %bb.0: ; %entry
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:  .LBB0_1: ; %load-store-loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    v_add_co_u32 v24, vcc_lo, v2, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT:    s_clause 0xf
+; ALIGNED-NEXT:    flat_load_dwordx4 v[16:19], v[24:25] offset:240
+; ALIGNED-NEXT:    flat_load_dwordx4 v[20:23], v[24:25] offset:224
+; ALIGNED-NEXT:    flat_load_dwordx4 v[4:7], v[24:25]
+; ALIGNED-NEXT:    flat_load_dwordx4 v[8:11], v[24:25] offset:16
+; ALIGNED-NEXT:    flat_load_dwordx4 v[12:15], v[24:25] offset:32
+; ALIGNED-NEXT:    flat_load_dwordx4 v[112:115], v[24:25] offset:48
+; ALIGNED-NEXT:    flat_load_dwordx4 v[116:119], v[24:25] offset:64
+; ALIGNED-NEXT:    flat_load_dwordx4 v[40:43], v[24:25] offset:80
+; ALIGNED-NEXT:    flat_load_dwordx4 v[26:29], v[24:25] offset:96
+; ALIGNED-NEXT:    flat_load_dwordx4 v[32:35], v[24:25] offset:112
+; ALIGNED-NEXT:    flat_load_dwordx4 v[44:47], v[24:25] offset:128
+; ALIGNED-NEXT:    flat_load_dwordx4 v[52:55], v[24:25] offset:144
+; ALIGNED-NEXT:    flat_load_dwordx4 v[66:69], v[24:25] offset:160
+; ALIGNED-NEXT:    flat_load_dwordx4 v[81:84], v[24:25] offset:176
+; ALIGNED-NEXT:    flat_load_dwordx4 v[96:99], v[24:25] offset:192
+; ALIGNED-NEXT:    flat_load_dwordx4 v[100:103], v[24:25] offset:208
+; ALIGNED-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v31 offset:254
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:252
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v30 offset:250
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:248
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v25 offset:246
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:244
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v24 offset:242
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:240
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(22)
+; ALIGNED-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v51 offset:238
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v51 offset:236
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v50 offset:234
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v50 offset:232
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v49 offset:230
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v49 offset:228
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v36 offset:226
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:224
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(16)
+; ALIGNED-NEXT:    buffer_store_dword v100, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_store_dword v102, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_store_dword v103, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v31
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 8, v31
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v30
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 8, v30
+; ALIGNED-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v71 offset:222
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v71 offset:220
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v70 offset:218
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v70 offset:216
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v65 offset:214
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v65 offset:212
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v64 offset:210
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v64 offset:208
+; ALIGNED-NEXT:    buffer_store_dword v96, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    buffer_store_dword v97, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_store_dword v98, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_store_dword v99, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v87, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    buffer_load_dword v86, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_load_dword v85, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_load_dword v80, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v87 offset:206
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v87 offset:204
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v86 offset:202
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v86 offset:200
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v85 offset:198
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v85 offset:196
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v80 offset:194
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v80 offset:192
+; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    buffer_store_dword v82, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_store_dword v83, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_store_dword v84, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v101, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    buffer_load_dword v99, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_load_dword v96, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_load_dword v81, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v101 offset:190
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v101 offset:188
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v99 offset:186
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v99 offset:184
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v96 offset:182
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v96 offset:180
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v81 offset:178
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v81 offset:176
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v100, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    buffer_load_dword v97, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_load_dword v82, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v100 offset:174
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v100 offset:172
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v97 offset:170
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v97 offset:168
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v82 offset:166
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v82 offset:164
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v66 offset:162
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v66 offset:160
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v98, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    buffer_load_dword v83, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v98 offset:158
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v98 offset:156
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v83 offset:154
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v83 offset:152
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v67 offset:150
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v67 offset:148
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v52 offset:146
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v52 offset:144
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v84, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v84 offset:142
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v84 offset:140
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v68 offset:138
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v68 offset:136
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v53 offset:134
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v53 offset:132
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v37 offset:130
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v37 offset:128
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v69 offset:126
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v69 offset:124
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v54 offset:122
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v54 offset:120
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v38 offset:118
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v38 offset:116
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v32 offset:114
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v32 offset:112
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v55 offset:110
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v55 offset:108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v39 offset:106
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v39 offset:104
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v33 offset:102
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v33 offset:100
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v26 offset:98
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v26 offset:96
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v48 offset:94
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v48 offset:92
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v34 offset:90
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v34 offset:88
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v27 offset:86
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v27 offset:84
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v21 offset:82
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v21 offset:80
+; ALIGNED-NEXT:    buffer_store_dword v116, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    buffer_store_dword v117, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_store_dword v118, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_store_dword v119, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v35 offset:78
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v35 offset:76
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v28 offset:74
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v28 offset:72
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v22 offset:70
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v22 offset:68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v19 offset:66
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v19 offset:64
+; ALIGNED-NEXT:    buffer_store_dword v112, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT:    buffer_store_dword v113, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT:    buffer_store_dword v114, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT:    buffer_store_dword v115, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 8, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v51
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 8, v51
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v50
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 8, v50
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v29 offset:62
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v29 offset:60
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v23 offset:58
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v23 offset:56
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v20 offset:54
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v20 offset:52
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v18 offset:50
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v18 offset:48
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v15 offset:42
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v15 offset:40
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v14 offset:46
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v14 offset:44
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v13 offset:34
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v13 offset:32
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v12 offset:38
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v12 offset:36
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v11 offset:30
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v11 offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v10 offset:26
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v10 offset:24
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v9 offset:22
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v9 offset:20
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v8 offset:18
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v8 offset:16
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:247
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v65
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:255
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v49
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v49
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:253
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v36
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 8, v36
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:251
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v71
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 8, v71
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:249
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v70
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 8, v70
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v65
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:245
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v64
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 8, v64
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:215
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v67
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v67
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v113 offset:243
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v87
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v87
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:241
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v86
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 8, v86
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v114 offset:239
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v85
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v51 offset:237
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 24, v80
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 8, v80
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v115 offset:235
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 8, v101
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v50 offset:233
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v99
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:231
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v96
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v96, 8, v96
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v49 offset:229
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 24, v81
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v81
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:227
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v100
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v100
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:225
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v97
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v97, 8, v97
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:223
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v82
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 8, v82
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v71 offset:221
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v66
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 8, v66
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:219
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v98
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v98
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v70 offset:217
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v83
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v65 offset:213
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 24, v52
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 8, v52
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:211
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v84
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 8, v84
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v64 offset:209
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 24, v68
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v67 offset:149
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 24, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 8, v68
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v113 offset:207
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v53
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 8, v53
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v87 offset:205
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 24, v37
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v37, 8, v37
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:203
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v69
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 8, v69
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v86 offset:201
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v54
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v114 offset:199
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v38
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v38, 8, v38
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v85 offset:197
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 24, v32
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v32, 8, v32
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v51 offset:195
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 24, v55
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 8, v55
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v80 offset:193
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v39
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v39, 8, v39
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v115 offset:191
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v33
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v33, 8, v33
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v101 offset:189
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v26
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v26, 8, v26
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v50 offset:187
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v48
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 8, v48
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v99 offset:185
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v34
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v34, 8, v34
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:183
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v27
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v27, 8, v27
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v96 offset:181
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v96, 24, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 8, v21
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v49 offset:179
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 24, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v35, 8, v35
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v81 offset:177
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 24, v28
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 8, v28
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:175
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v22
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v22, 8, v22
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v100 offset:173
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 24, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v19, 8, v19
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:171
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v29
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 8, v29
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v97 offset:169
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v97, 24, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v23, 8, v23
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:167
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 8, v20
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v82 offset:165
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v18, 8, v18
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v71 offset:163
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v15, 8, v15
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v66 offset:161
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 24, v14
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v14, 8, v14
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:159
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v13, 8, v13
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v98 offset:157
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 24, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v12, 8, v12
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v70 offset:155
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v11
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 8, v11
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v83 offset:153
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 24, v10
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 8, v10
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:151
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v9
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v65 offset:147
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v52 offset:145
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:143
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v84 offset:141
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v64 offset:139
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v68 offset:137
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v113 offset:135
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v53 offset:133
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v87 offset:131
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v37 offset:129
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:127
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v69 offset:125
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v86 offset:123
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v54 offset:121
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v114 offset:119
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v38 offset:117
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v85 offset:115
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v32 offset:113
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v51 offset:111
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v55 offset:109
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v80 offset:107
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v39 offset:105
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v115 offset:103
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v33 offset:101
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v101 offset:99
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v26 offset:97
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v50 offset:95
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v48 offset:93
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v99 offset:91
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v34 offset:89
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:87
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v27 offset:85
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v96 offset:83
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v21 offset:81
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v49 offset:79
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v35 offset:77
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v81 offset:75
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v28 offset:73
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:71
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v22 offset:69
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v100 offset:67
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v19 offset:65
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:63
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v29 offset:61
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v97 offset:59
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v23 offset:57
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:55
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v20 offset:53
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v82 offset:51
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v18 offset:49
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v71 offset:43
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v15 offset:41
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v66 offset:47
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v14 offset:45
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:35
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v13 offset:33
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v98 offset:39
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v12 offset:37
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v70 offset:31
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v11 offset:29
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v83 offset:27
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v10 offset:25
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:23
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v9 offset:21
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v67 offset:19
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v8 offset:17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v7 offset:14
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v7 offset:12
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v6 offset:10
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v6 offset:8
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v5 offset:6
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v5 offset:4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v4 offset:2
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v4
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 24, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v8 offset:15
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v7 offset:13
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v9 offset:11
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v6 offset:9
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v10 offset:7
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v5 offset:5
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v11 offset:3
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v4 offset:1
+; ALIGNED-NEXT:    s_cbranch_vccnz .LBB0_1
+; ALIGNED-NEXT:  ; %bb.2: ; %memcpy-split
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_dword v47, off, s[0:3], s32
+; ALIGNED-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memcpy_p0_p0_sz2048:
+; UNROLL3:       ; %bb.0: ; %entry
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB0_1: ; %load-store-loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    v_add_co_u32 v12, vcc_lo, v2, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 48
+; UNROLL3-NEXT:    s_clause 0x2
+; UNROLL3-NEXT:    flat_load_dwordx4 v[4:7], v[12:13]
+; UNROLL3-NEXT:    flat_load_dwordx4 v[8:11], v[12:13] offset:16
+; UNROLL3-NEXT:    flat_load_dwordx4 v[12:15], v[12:13] offset:32
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[8:11] offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[12:15] offset:32
+; UNROLL3-NEXT:    v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5]
+; UNROLL3-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; UNROLL3-NEXT:    s_cbranch_vccnz .LBB0_1
+; UNROLL3-NEXT:  ; %bb.2: ; %memcpy-split
+; UNROLL3-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:2016
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:2016
+; UNROLL3-NEXT:    flat_load_dwordx4 v[2:5], v[2:3] offset:2032
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:2032
+; UNROLL3-NEXT:    s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 2048, i1 false)
+  ret void
+}
+
+define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz2048:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:  .LBB1_1: ; %load-store-loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_co_u32 v96, vcc_lo, v2, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
+; CHECK-NEXT:    s_clause 0xf
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[96:97], off offset:224
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[96:97], off offset:240
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v[96:97], off offset:192
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v[96:97], off offset:208
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v[96:97], off offset:160
+; CHECK-NEXT:    global_load_dwordx4 v[24:27], v[96:97], off offset:176
+; CHECK-NEXT:    global_load_dwordx4 v[28:31], v[96:97], off offset:128
+; CHECK-NEXT:    global_load_dwordx4 v[32:35], v[96:97], off offset:144
+; CHECK-NEXT:    global_load_dwordx4 v[36:39], v[96:97], off offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[48:51], v[96:97], off offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[52:55], v[96:97], off offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[64:67], v[96:97], off offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[68:71], v[96:97], off offset:32
+; CHECK-NEXT:    global_load_dwordx4 v[80:83], v[96:97], off offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[84:87], v[96:97], off
+; CHECK-NEXT:    global_load_dwordx4 v[96:99], v[96:97], off offset:16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(15)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[4:7], off offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[8:11], off offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[12:15], off offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[16:19], off offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[20:23], off offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[24:27], off offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[28:31], off offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[32:35], off offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[36:39], off offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[48:51], off offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[52:55], off offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[64:67], off offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[68:71], off offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[80:83], off offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[84:87], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[96:99], off offset:16
+; CHECK-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; CHECK-NEXT:    s_cbranch_vccnz .LBB1_1
+; CHECK-NEXT:  ; %bb.2: ; %memcpy-split
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memcpy_p1_p1_sz2048:
+; ALIGNED:       ; %bb.0: ; %entry
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:  .LBB1_1: ; %load-store-loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    v_add_co_u32 v24, vcc_lo, v2, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT:    s_clause 0xf
+; ALIGNED-NEXT:    global_load_dwordx4 v[16:19], v[24:25], off offset:240
+; ALIGNED-NEXT:    global_load_dwordx4 v[20:23], v[24:25], off offset:224
+; ALIGNED-NEXT:    global_load_dwordx4 v[4:7], v[24:25], off
+; ALIGNED-NEXT:    global_load_dwordx4 v[8:11], v[24:25], off offset:16
+; ALIGNED-NEXT:    global_load_dwordx4 v[12:15], v[24:25], off offset:32
+; ALIGNED-NEXT:    global_load_dwordx4 v[112:115], v[24:25], off offset:48
+; ALIGNED-NEXT:    global_load_dwordx4 v[116:119], v[24:25], off offset:64
+; ALIGNED-NEXT:    global_load_dwordx4 v[40:43], v[24:25], off offset:80
+; ALIGNED-NEXT:    global_load_dwordx4 v[26:29], v[24:25], off offset:96
+; ALIGNED-NEXT:    global_load_dwordx4 v[32:35], v[24:25], off offset:112
+; ALIGNED-NEXT:    global_load_dwordx4 v[44:47], v[24:25], off offset:128
+; ALIGNED-NEXT:    global_load_dwordx4 v[52:55], v[24:25], off offset:144
+; ALIGNED-NEXT:    global_load_dwordx4 v[66:69], v[24:25], off offset:160
+; ALIGNED-NEXT:    global_load_dwordx4 v[81:84], v[24:25], off offset:176
+; ALIGNED-NEXT:    global_load_dwordx4 v[96:99], v[24:25], off offset:192
+; ALIGNED-NEXT:    global_load_dwordx4 v[100:103], v[24:25], off offset:208
+; ALIGNED-NEXT:    s_waitcnt vmcnt(15)
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v31, off offset:254
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:252
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v30, off offset:250
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:248
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v25, off offset:246
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:244
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v24, off offset:242
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:240
+; ALIGNED-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v51, off offset:238
+; ALIGNED-NEXT:    global_store_byte v[16:17], v51, off offset:236
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v50, off offset:234
+; ALIGNED-NEXT:    global_store_byte v[16:17], v50, off offset:232
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v49, off offset:230
+; ALIGNED-NEXT:    global_store_byte v[16:17], v49, off offset:228
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v36, off offset:226
+; ALIGNED-NEXT:    global_store_byte v[16:17], v36, off offset:224
+; ALIGNED-NEXT:    buffer_store_dword v100, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_store_dword v102, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_store_dword v103, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v31
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 8, v31
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v30
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 8, v30
+; ALIGNED-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v71, off offset:222
+; ALIGNED-NEXT:    global_store_byte v[16:17], v71, off offset:220
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v70, off offset:218
+; ALIGNED-NEXT:    global_store_byte v[16:17], v70, off offset:216
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v65, off offset:214
+; ALIGNED-NEXT:    global_store_byte v[16:17], v65, off offset:212
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v64, off offset:210
+; ALIGNED-NEXT:    global_store_byte v[16:17], v64, off offset:208
+; ALIGNED-NEXT:    buffer_store_dword v96, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    buffer_store_dword v97, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_store_dword v98, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_store_dword v99, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v87, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    buffer_load_dword v86, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_load_dword v85, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_load_dword v80, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v87, off offset:206
+; ALIGNED-NEXT:    global_store_byte v[16:17], v87, off offset:204
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v86, off offset:202
+; ALIGNED-NEXT:    global_store_byte v[16:17], v86, off offset:200
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v85, off offset:198
+; ALIGNED-NEXT:    global_store_byte v[16:17], v85, off offset:196
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v80, off offset:194
+; ALIGNED-NEXT:    global_store_byte v[16:17], v80, off offset:192
+; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    buffer_store_dword v82, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_store_dword v83, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_store_dword v84, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v101, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    buffer_load_dword v99, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_load_dword v96, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_load_dword v81, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v101, off offset:190
+; ALIGNED-NEXT:    global_store_byte v[16:17], v101, off offset:188
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v99, off offset:186
+; ALIGNED-NEXT:    global_store_byte v[16:17], v99, off offset:184
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v96, off offset:182
+; ALIGNED-NEXT:    global_store_byte v[16:17], v96, off offset:180
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v81, off offset:178
+; ALIGNED-NEXT:    global_store_byte v[16:17], v81, off offset:176
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v100, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    buffer_load_dword v97, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_load_dword v82, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v100, off offset:174
+; ALIGNED-NEXT:    global_store_byte v[16:17], v100, off offset:172
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v97, off offset:170
+; ALIGNED-NEXT:    global_store_byte v[16:17], v97, off offset:168
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v82, off offset:166
+; ALIGNED-NEXT:    global_store_byte v[16:17], v82, off offset:164
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v66, off offset:162
+; ALIGNED-NEXT:    global_store_byte v[16:17], v66, off offset:160
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v98, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    buffer_load_dword v83, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v98, off offset:158
+; ALIGNED-NEXT:    global_store_byte v[16:17], v98, off offset:156
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v83, off offset:154
+; ALIGNED-NEXT:    global_store_byte v[16:17], v83, off offset:152
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v67, off offset:150
+; ALIGNED-NEXT:    global_store_byte v[16:17], v67, off offset:148
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v52, off offset:146
+; ALIGNED-NEXT:    global_store_byte v[16:17], v52, off offset:144
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v84, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v84, off offset:142
+; ALIGNED-NEXT:    global_store_byte v[16:17], v84, off offset:140
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v68, off offset:138
+; ALIGNED-NEXT:    global_store_byte v[16:17], v68, off offset:136
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v53, off offset:134
+; ALIGNED-NEXT:    global_store_byte v[16:17], v53, off offset:132
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v37, off offset:130
+; ALIGNED-NEXT:    global_store_byte v[16:17], v37, off offset:128
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v69, off offset:126
+; ALIGNED-NEXT:    global_store_byte v[16:17], v69, off offset:124
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v54, off offset:122
+; ALIGNED-NEXT:    global_store_byte v[16:17], v54, off offset:120
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v38, off offset:118
+; ALIGNED-NEXT:    global_store_byte v[16:17], v38, off offset:116
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v32, off offset:114
+; ALIGNED-NEXT:    global_store_byte v[16:17], v32, off offset:112
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v55, off offset:110
+; ALIGNED-NEXT:    global_store_byte v[16:17], v55, off offset:108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v39, off offset:106
+; ALIGNED-NEXT:    global_store_byte v[16:17], v39, off offset:104
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v33, off offset:102
+; ALIGNED-NEXT:    global_store_byte v[16:17], v33, off offset:100
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v26, off offset:98
+; ALIGNED-NEXT:    global_store_byte v[16:17], v26, off offset:96
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v48, off offset:94
+; ALIGNED-NEXT:    global_store_byte v[16:17], v48, off offset:92
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v34, off offset:90
+; ALIGNED-NEXT:    global_store_byte v[16:17], v34, off offset:88
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v27, off offset:86
+; ALIGNED-NEXT:    global_store_byte v[16:17], v27, off offset:84
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v21, off offset:82
+; ALIGNED-NEXT:    global_store_byte v[16:17], v21, off offset:80
+; ALIGNED-NEXT:    buffer_store_dword v116, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    buffer_store_dword v117, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_store_dword v118, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_store_dword v119, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v35, off offset:78
+; ALIGNED-NEXT:    global_store_byte v[16:17], v35, off offset:76
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v28, off offset:74
+; ALIGNED-NEXT:    global_store_byte v[16:17], v28, off offset:72
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v22, off offset:70
+; ALIGNED-NEXT:    global_store_byte v[16:17], v22, off offset:68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v19, off offset:66
+; ALIGNED-NEXT:    global_store_byte v[16:17], v19, off offset:64
+; ALIGNED-NEXT:    buffer_store_dword v112, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT:    buffer_store_dword v113, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT:    buffer_store_dword v114, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT:    buffer_store_dword v115, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 8, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v51
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 8, v51
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v50
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 8, v50
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v29, off offset:62
+; ALIGNED-NEXT:    global_store_byte v[16:17], v29, off offset:60
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v23, off offset:58
+; ALIGNED-NEXT:    global_store_byte v[16:17], v23, off offset:56
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v20, off offset:54
+; ALIGNED-NEXT:    global_store_byte v[16:17], v20, off offset:52
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v18, off offset:50
+; ALIGNED-NEXT:    global_store_byte v[16:17], v18, off offset:48
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v15, off offset:42
+; ALIGNED-NEXT:    global_store_byte v[16:17], v15, off offset:40
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v14, off offset:46
+; ALIGNED-NEXT:    global_store_byte v[16:17], v14, off offset:44
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v13, off offset:34
+; ALIGNED-NEXT:    global_store_byte v[16:17], v13, off offset:32
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v12, off offset:38
+; ALIGNED-NEXT:    global_store_byte v[16:17], v12, off offset:36
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v11, off offset:30
+; ALIGNED-NEXT:    global_store_byte v[16:17], v11, off offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v10, off offset:26
+; ALIGNED-NEXT:    global_store_byte v[16:17], v10, off offset:24
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v9, off offset:22
+; ALIGNED-NEXT:    global_store_byte v[16:17], v9, off offset:20
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v8, off offset:18
+; ALIGNED-NEXT:    global_store_byte v[16:17], v8, off offset:16
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:247
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v65
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:255
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v49
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v49
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:253
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v36
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 8, v36
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:251
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v71
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 8, v71
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:249
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v70
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 8, v70
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v65
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:245
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v64
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 8, v64
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:215
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v67
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v67
+; ALIGNED-NEXT:    global_store_byte v[16:17], v113, off offset:243
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v87
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v87
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:241
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v86
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 8, v86
+; ALIGNED-NEXT:    global_store_byte v[16:17], v114, off offset:239
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v85
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
+; ALIGNED-NEXT:    global_store_byte v[16:17], v51, off offset:237
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 24, v80
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 8, v80
+; ALIGNED-NEXT:    global_store_byte v[16:17], v115, off offset:235
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 8, v101
+; ALIGNED-NEXT:    global_store_byte v[16:17], v50, off offset:233
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v99
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:231
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v96
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v96, 8, v96
+; ALIGNED-NEXT:    global_store_byte v[16:17], v49, off offset:229
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 24, v81
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v81
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:227
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v100
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v100
+; ALIGNED-NEXT:    global_store_byte v[16:17], v36, off offset:225
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v97
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v97, 8, v97
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:223
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v82
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 8, v82
+; ALIGNED-NEXT:    global_store_byte v[16:17], v71, off offset:221
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v66
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 8, v66
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:219
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v98
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v98
+; ALIGNED-NEXT:    global_store_byte v[16:17], v70, off offset:217
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v83
+; ALIGNED-NEXT:    global_store_byte v[16:17], v65, off offset:213
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 24, v52
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 8, v52
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:211
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v84
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 8, v84
+; ALIGNED-NEXT:    global_store_byte v[16:17], v64, off offset:209
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 24, v68
+; ALIGNED-NEXT:    global_store_byte v[16:17], v67, off offset:149
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 24, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 8, v68
+; ALIGNED-NEXT:    global_store_byte v[16:17], v113, off offset:207
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v53
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 8, v53
+; ALIGNED-NEXT:    global_store_byte v[16:17], v87, off offset:205
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 24, v37
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v37, 8, v37
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:203
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v69
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 8, v69
+; ALIGNED-NEXT:    global_store_byte v[16:17], v86, off offset:201
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v54
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
+; ALIGNED-NEXT:    global_store_byte v[16:17], v114, off offset:199
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v38
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v38, 8, v38
+; ALIGNED-NEXT:    global_store_byte v[16:17], v85, off offset:197
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 24, v32
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v32, 8, v32
+; ALIGNED-NEXT:    global_store_byte v[16:17], v51, off offset:195
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 24, v55
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 8, v55
+; ALIGNED-NEXT:    global_store_byte v[16:17], v80, off offset:193
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v39
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v39, 8, v39
+; ALIGNED-NEXT:    global_store_byte v[16:17], v115, off offset:191
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v33
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v33, 8, v33
+; ALIGNED-NEXT:    global_store_byte v[16:17], v101, off offset:189
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v26
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v26, 8, v26
+; ALIGNED-NEXT:    global_store_byte v[16:17], v50, off offset:187
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v48
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 8, v48
+; ALIGNED-NEXT:    global_store_byte v[16:17], v99, off offset:185
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v34
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v34, 8, v34
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:183
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v27
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v27, 8, v27
+; ALIGNED-NEXT:    global_store_byte v[16:17], v96, off offset:181
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v96, 24, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 8, v21
+; ALIGNED-NEXT:    global_store_byte v[16:17], v49, off offset:179
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 24, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v35, 8, v35
+; ALIGNED-NEXT:    global_store_byte v[16:17], v81, off offset:177
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 24, v28
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 8, v28
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:175
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v22
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v22, 8, v22
+; ALIGNED-NEXT:    global_store_byte v[16:17], v100, off offset:173
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 24, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v19, 8, v19
+; ALIGNED-NEXT:    global_store_byte v[16:17], v36, off offset:171
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v29
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 8, v29
+; ALIGNED-NEXT:    global_store_byte v[16:17], v97, off offset:169
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v97, 24, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v23, 8, v23
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:167
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 8, v20
+; ALIGNED-NEXT:    global_store_byte v[16:17], v82, off offset:165
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v18, 8, v18
+; ALIGNED-NEXT:    global_store_byte v[16:17], v71, off offset:163
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v15, 8, v15
+; ALIGNED-NEXT:    global_store_byte v[16:17], v66, off offset:161
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 24, v14
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v14, 8, v14
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:159
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v13, 8, v13
+; ALIGNED-NEXT:    global_store_byte v[16:17], v98, off offset:157
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 24, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v12, 8, v12
+; ALIGNED-NEXT:    global_store_byte v[16:17], v70, off offset:155
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v11
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 8, v11
+; ALIGNED-NEXT:    global_store_byte v[16:17], v83, off offset:153
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 24, v10
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 8, v10
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:151
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v9
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
+; ALIGNED-NEXT:    global_store_byte v[16:17], v65, off offset:147
+; ALIGNED-NEXT:    global_store_byte v[16:17], v52, off offset:145
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:143
+; ALIGNED-NEXT:    global_store_byte v[16:17], v84, off offset:141
+; ALIGNED-NEXT:    global_store_byte v[16:17], v64, off offset:139
+; ALIGNED-NEXT:    global_store_byte v[16:17], v68, off offset:137
+; ALIGNED-NEXT:    global_store_byte v[16:17], v113, off offset:135
+; ALIGNED-NEXT:    global_store_byte v[16:17], v53, off offset:133
+; ALIGNED-NEXT:    global_store_byte v[16:17], v87, off offset:131
+; ALIGNED-NEXT:    global_store_byte v[16:17], v37, off offset:129
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:127
+; ALIGNED-NEXT:    global_store_byte v[16:17], v69, off offset:125
+; ALIGNED-NEXT:    global_store_byte v[16:17], v86, off offset:123
+; ALIGNED-NEXT:    global_store_byte v[16:17], v54, off offset:121
+; ALIGNED-NEXT:    global_store_byte v[16:17], v114, off offset:119
+; ALIGNED-NEXT:    global_store_byte v[16:17], v38, off offset:117
+; ALIGNED-NEXT:    global_store_byte v[16:17], v85, off offset:115
+; ALIGNED-NEXT:    global_store_byte v[16:17], v32, off offset:113
+; ALIGNED-NEXT:    global_store_byte v[16:17], v51, off offset:111
+; ALIGNED-NEXT:    global_store_byte v[16:17], v55, off offset:109
+; ALIGNED-NEXT:    global_store_byte v[16:17], v80, off offset:107
+; ALIGNED-NEXT:    global_store_byte v[16:17], v39, off offset:105
+; ALIGNED-NEXT:    global_store_byte v[16:17], v115, off offset:103
+; ALIGNED-NEXT:    global_store_byte v[16:17], v33, off offset:101
+; ALIGNED-NEXT:    global_store_byte v[16:17], v101, off offset:99
+; ALIGNED-NEXT:    global_store_byte v[16:17], v26, off offset:97
+; ALIGNED-NEXT:    global_store_byte v[16:17], v50, off offset:95
+; ALIGNED-NEXT:    global_store_byte v[16:17], v48, off offset:93
+; ALIGNED-NEXT:    global_store_byte v[16:17], v99, off offset:91
+; ALIGNED-NEXT:    global_store_byte v[16:17], v34, off offset:89
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:87
+; ALIGNED-NEXT:    global_store_byte v[16:17], v27, off offset:85
+; ALIGNED-NEXT:    global_store_byte v[16:17], v96, off offset:83
+; ALIGNED-NEXT:    global_store_byte v[16:17], v21, off offset:81
+; ALIGNED-NEXT:    global_store_byte v[16:17], v49, off offset:79
+; ALIGNED-NEXT:    global_store_byte v[16:17], v35, off offset:77
+; ALIGNED-NEXT:    global_store_byte v[16:17], v81, off offset:75
+; ALIGNED-NEXT:    global_store_byte v[16:17], v28, off offset:73
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:71
+; ALIGNED-NEXT:    global_store_byte v[16:17], v22, off offset:69
+; ALIGNED-NEXT:    global_store_byte v[16:17], v100, off offset:67
+; ALIGNED-NEXT:    global_store_byte v[16:17], v19, off offset:65
+; ALIGNED-NEXT:    global_store_byte v[16:17], v36, off offset:63
+; ALIGNED-NEXT:    global_store_byte v[16:17], v29, off offset:61
+; ALIGNED-NEXT:    global_store_byte v[16:17], v97, off offset:59
+; ALIGNED-NEXT:    global_store_byte v[16:17], v23, off offset:57
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:55
+; ALIGNED-NEXT:    global_store_byte v[16:17], v20, off offset:53
+; ALIGNED-NEXT:    global_store_byte v[16:17], v82, off offset:51
+; ALIGNED-NEXT:    global_store_byte v[16:17], v18, off offset:49
+; ALIGNED-NEXT:    global_store_byte v[16:17], v71, off offset:43
+; ALIGNED-NEXT:    global_store_byte v[16:17], v15, off offset:41
+; ALIGNED-NEXT:    global_store_byte v[16:17], v66, off offset:47
+; ALIGNED-NEXT:    global_store_byte v[16:17], v14, off offset:45
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:35
+; ALIGNED-NEXT:    global_store_byte v[16:17], v13, off offset:33
+; ALIGNED-NEXT:    global_store_byte v[16:17], v98, off offset:39
+; ALIGNED-NEXT:    global_store_byte v[16:17], v12, off offset:37
+; ALIGNED-NEXT:    global_store_byte v[16:17], v70, off offset:31
+; ALIGNED-NEXT:    global_store_byte v[16:17], v11, off offset:29
+; ALIGNED-NEXT:    global_store_byte v[16:17], v83, off offset:27
+; ALIGNED-NEXT:    global_store_byte v[16:17], v10, off offset:25
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:23
+; ALIGNED-NEXT:    global_store_byte v[16:17], v9, off offset:21
+; ALIGNED-NEXT:    global_store_byte v[16:17], v67, off offset:19
+; ALIGNED-NEXT:    global_store_byte v[16:17], v8, off offset:17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v7, off offset:14
+; ALIGNED-NEXT:    global_store_byte v[16:17], v7, off offset:12
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v6, off offset:10
+; ALIGNED-NEXT:    global_store_byte v[16:17], v6, off offset:8
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v5, off offset:6
+; ALIGNED-NEXT:    global_store_byte v[16:17], v5, off offset:4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v4, off offset:2
+; ALIGNED-NEXT:    global_store_byte v[16:17], v4, off
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 24, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
+; ALIGNED-NEXT:    global_store_byte v[16:17], v8, off offset:15
+; ALIGNED-NEXT:    global_store_byte v[16:17], v7, off offset:13
+; ALIGNED-NEXT:    global_store_byte v[16:17], v9, off offset:11
+; ALIGNED-NEXT:    global_store_byte v[16:17], v6, off offset:9
+; ALIGNED-NEXT:    global_store_byte v[16:17], v10, off offset:7
+; ALIGNED-NEXT:    global_store_byte v[16:17], v5, off offset:5
+; ALIGNED-NEXT:    global_store_byte v[16:17], v11, off offset:3
+; ALIGNED-NEXT:    global_store_byte v[16:17], v4, off offset:1
+; ALIGNED-NEXT:    s_cbranch_vccnz .LBB1_1
+; ALIGNED-NEXT:  ; %bb.2: ; %memcpy-split
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_dword v47, off, s[0:3], s32
+; ALIGNED-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memcpy_p1_p1_sz2048:
+; UNROLL3:       ; %bb.0: ; %entry
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB1_1: ; %load-store-loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    v_add_co_u32 v12, vcc_lo, v2, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 48
+; UNROLL3-NEXT:    s_clause 0x2
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off
+; UNROLL3-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:16
+; UNROLL3-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:32
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:32
+; UNROLL3-NEXT:    v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5]
+; UNROLL3-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; UNROLL3-NEXT:    s_cbranch_vccnz .LBB1_1
+; UNROLL3-NEXT:  ; %bb.2: ; %memcpy-split
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:2016
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:2016
+; UNROLL3-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:2032
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:2032
+; UNROLL3-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 2048, i1 false)
+  ret void
+}
+
+define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz2048:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:  .LBB2_1: ; %load-store-loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_co_u32 v96, vcc_lo, v2, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
+; CHECK-NEXT:    s_clause 0xf
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[96:97], off offset:240
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[96:97], off offset:224
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v[96:97], off offset:208
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v[96:97], off offset:192
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v[96:97], off offset:176
+; CHECK-NEXT:    global_load_dwordx4 v[24:27], v[96:97], off offset:160
+; CHECK-NEXT:    global_load_dwordx4 v[28:31], v[96:97], off offset:144
+; CHECK-NEXT:    global_load_dwordx4 v[32:35], v[96:97], off offset:128
+; CHECK-NEXT:    global_load_dwordx4 v[36:39], v[96:97], off offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[48:51], v[96:97], off offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[52:55], v[96:97], off offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[64:67], v[96:97], off offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[68:71], v[96:97], off offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[80:83], v[96:97], off offset:32
+; CHECK-NEXT:    global_load_dwordx4 v[84:87], v[96:97], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[96:99], v[96:97], off
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[4:7] offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[8:11] offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[12:15] offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[16:19] offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[20:23] offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[24:27] offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[28:31] offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[32:35] offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[36:39] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99]
+; CHECK-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; CHECK-NEXT:    s_cbranch_vccnz .LBB2_1
+; CHECK-NEXT:  ; %bb.2: ; %memcpy-split
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memcpy_p0_p4_sz2048:
+; ALIGNED:       ; %bb.0: ; %entry
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT:  .LBB2_1: ; %load-store-loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    v_add_co_u32 v4, vcc_lo, v2, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT:    v_add_co_u32 v96, vcc_lo, v0, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT:    s_clause 0xf
+; ALIGNED-NEXT:    global_load_dwordx4 v[98:101], v[4:5], off offset:240
+; ALIGNED-NEXT:    global_load_dwordx4 v[84:87], v[4:5], off offset:224
+; ALIGNED-NEXT:    global_load_dwordx4 v[80:83], v[4:5], off offset:208
+; ALIGNED-NEXT:    global_load_dwordx4 v[68:71], v[4:5], off offset:192
+; ALIGNED-NEXT:    global_load_dwordx4 v[64:67], v[4:5], off offset:176
+; ALIGNED-NEXT:    global_load_dwordx4 v[52:55], v[4:5], off offset:160
+; ALIGNED-NEXT:    global_load_dwordx4 v[48:51], v[4:5], off offset:144
+; ALIGNED-NEXT:    global_load_dwordx4 v[36:39], v[4:5], off offset:128
+; ALIGNED-NEXT:    global_load_dwordx4 v[32:35], v[4:5], off offset:112
+; ALIGNED-NEXT:    global_load_dwordx4 v[28:31], v[4:5], off offset:96
+; ALIGNED-NEXT:    global_load_dwordx4 v[24:27], v[4:5], off offset:80
+; ALIGNED-NEXT:    global_load_dwordx4 v[20:23], v[4:5], off offset:64
+; ALIGNED-NEXT:    global_load_dwordx4 v[16:19], v[4:5], off offset:48
+; ALIGNED-NEXT:    global_load_dwordx4 v[12:15], v[4:5], off offset:32
+; ALIGNED-NEXT:    global_load_dwordx4 v[8:11], v[4:5], off offset:16
+; ALIGNED-NEXT:    global_load_dwordx4 v[4:7], v[4:5], off
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(15)
+; ALIGNED-NEXT:    buffer_store_dword v100, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    buffer_store_dword v99, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_store_dword v98, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v100 offset:250
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v101 offset:254
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:252
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:248
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v99 offset:246
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:244
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v98 offset:242
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:240
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v100
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v100
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 8, v101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v98
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v98
+; ALIGNED-NEXT:    s_waitcnt vmcnt(14)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v86
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v86
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:251
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v87
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:249
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v87
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:255
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v85
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:253
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 8, v85
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:247
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v84
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:245
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v84
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:243
+; ALIGNED-NEXT:    s_waitcnt vmcnt(13)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v82
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:241
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v82
+; ALIGNED-NEXT:    buffer_store_dword v86, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_store_dword v87, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    buffer_store_dword v85, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_store_dword v84, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v86 offset:234
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v87 offset:238
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:236
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:232
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v85 offset:230
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:228
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v84 offset:226
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:224
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v81
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v81
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:235
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v80
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:233
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v80
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:239
+; ALIGNED-NEXT:    s_waitcnt vmcnt(12)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v70
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:237
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v70
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:231
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v71
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:229
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 8, v71
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:227
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v69
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:225
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v69
+; ALIGNED-NEXT:    buffer_store_dword v82, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_store_dword v83, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_store_dword v80, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v82 offset:218
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v83 offset:222
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:220
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:216
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v81 offset:214
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:212
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v80 offset:210
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:208
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v68
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(11)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v66
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v66
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:219
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v67
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:217
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v67
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:223
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v65
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:221
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v65
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:215
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v64
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:213
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v64
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:211
+; ALIGNED-NEXT:    s_waitcnt vmcnt(10)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v54
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:209
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v54
+; ALIGNED-NEXT:    buffer_store_dword v70, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_store_dword v71, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v70 offset:202
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v71 offset:206
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:204
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:200
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v69 offset:198
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:196
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v68 offset:194
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:192
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 8, v55
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v52
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 8, v52
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 24, v55
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:203
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v53
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:201
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v53
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:207
+; ALIGNED-NEXT:    s_waitcnt vmcnt(9)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v50
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:205
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 8, v50
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:199
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v51
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:197
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v51
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:195
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v49
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:193
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v49
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    buffer_store_dword v65, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_store_dword v64, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v66 offset:186
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v67 offset:190
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v67 offset:188
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v66 offset:184
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v65 offset:182
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v65 offset:180
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v64 offset:178
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v64 offset:176
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 24, v48
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v48
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:187
+; ALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v39
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 24, v38
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v38
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:185
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v39
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:191
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v37
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:189
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v37
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:183
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v36
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:181
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v36
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:179
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v34
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:177
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v34
+; ALIGNED-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v54 offset:170
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v54 offset:168
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v55 offset:174
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v55 offset:172
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v52 offset:162
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v52 offset:160
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v53 offset:166
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v53 offset:164
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 24, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 8, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 24, v33
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v116, 8, v33
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:171
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v32
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:169
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v32
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:173
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 24, v31
+; ALIGNED-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:163
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:161
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v29
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:175
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 24, v30
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 8, v30
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:167
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:165
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v29
+; ALIGNED-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v50 offset:154
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v51 offset:158
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v51 offset:156
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v50 offset:152
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v49 offset:150
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v49 offset:148
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v48 offset:146
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v48 offset:144
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 24, v28
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v26
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:155
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:153
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:159
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:157
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:151
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:149
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v64 offset:147
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v65 offset:145
+; ALIGNED-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v38 offset:138
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v39 offset:142
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v39 offset:140
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v38 offset:136
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v37 offset:134
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v37 offset:132
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v36 offset:130
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v36 offset:128
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:143
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 8, v26
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v66 offset:139
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v67 offset:137
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:141
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:135
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:133
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:131
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:129
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v34 offset:122
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v35 offset:126
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v35 offset:124
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v34 offset:120
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v33 offset:118
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v33 offset:116
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v32 offset:114
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v32 offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v34, 24, v14
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:123
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:121
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v53 offset:127
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v54 offset:125
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v55 offset:119
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v116 offset:117
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:115
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v10
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:113
+; ALIGNED-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v28 offset:96
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:111
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 24, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v27
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v35, 8, v14
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v10
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:109
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 8, v6
+; ALIGNED-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 8, v27
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 24, v11
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:103
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 24, v22
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v22
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v37, 8, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v38, 24, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v39, 8, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 24, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v17
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v17
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v32, 24, v16
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v33, 8, v16
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 24, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 8, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 24, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 8, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 8, v11
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v9
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 8, v9
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v52 offset:107
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 24, v8
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:105
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 8, v8
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v7
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v48 offset:99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 24, v5
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v49 offset:97
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v5
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT:    buffer_store_dword v24, off, s[0:3], s32
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v26 offset:90
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v27 offset:94
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v27 offset:92
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v26 offset:88
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v25 offset:86
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v25 offset:84
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v24 offset:82
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v24 offset:80
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v4
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 8, v4
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v50 offset:91
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v51 offset:89
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:95
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:93
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:87
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:85
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:83
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:81
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v22 offset:74
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v23 offset:78
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v23 offset:76
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v22 offset:72
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v21 offset:70
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v21 offset:68
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v20 offset:66
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v20 offset:64
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v64 offset:75
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v65 offset:73
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v36 offset:79
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v37 offset:77
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v38 offset:71
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v39 offset:69
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v66 offset:67
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v67 offset:65
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:59
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v18 offset:58
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:57
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v19 offset:62
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:63
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v19 offset:60
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:61
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v18 offset:56
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:55
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v17 offset:54
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:53
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v17 offset:52
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v32 offset:51
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v16 offset:50
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v33 offset:49
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v16 offset:48
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v14 offset:42
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v34 offset:43
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v35 offset:41
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v15 offset:46
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:47
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v15 offset:44
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:45
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v14 offset:40
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v53 offset:39
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v13 offset:38
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v54 offset:37
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v13 offset:36
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v55 offset:35
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v12 offset:34
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:33
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v12 offset:32
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v10 offset:26
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:27
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:25
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v11 offset:30
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v28 offset:31
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v11 offset:28
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v29 offset:29
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v10 offset:24
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v30 offset:23
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v9 offset:22
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v31 offset:21
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v9 offset:20
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v52 offset:19
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v8 offset:18
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:17
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v8 offset:16
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v6 offset:10
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:11
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:9
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v7 offset:14
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:15
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v7 offset:12
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:13
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v6 offset:8
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v48 offset:7
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v5 offset:6
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v49 offset:5
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v5 offset:4
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v24 offset:3
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v4 offset:2
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v25 offset:1
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v4
+; ALIGNED-NEXT:    s_cbranch_vccnz .LBB2_1
+; ALIGNED-NEXT:  ; %bb.2: ; %memcpy-split
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
+; ALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memcpy_p0_p4_sz2048:
+; UNROLL3:       ; %bb.0: ; %entry
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB2_1: ; %load-store-loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    v_add_co_u32 v12, vcc_lo, v2, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 48
+; UNROLL3-NEXT:    s_clause 0x2
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
+; UNROLL3-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off
+; UNROLL3-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:32
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[4:7] offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[12:15] offset:32
+; UNROLL3-NEXT:    v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5]
+; UNROLL3-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; UNROLL3-NEXT:    s_cbranch_vccnz .LBB2_1
+; UNROLL3-NEXT:  ; %bb.2: ; %memcpy-split
+; UNROLL3-NEXT:    s_clause 0x1
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:2016
+; UNROLL3-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:2032
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:2016
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[8:11] offset:2032
+; UNROLL3-NEXT:    s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 2048, i1 false)
+  ret void
+}
+
+define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz2048:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:  .LBB3_1: ; %load-store-loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_clause 0x3e
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:252
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:248
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:244
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:236
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:232
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:228
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:224
+; CHECK-NEXT:    buffer_load_dword v10, v1, s[0:3], 0 offen offset:220
+; CHECK-NEXT:    buffer_load_dword v11, v1, s[0:3], 0 offen offset:216
+; CHECK-NEXT:    buffer_load_dword v12, v1, s[0:3], 0 offen offset:212
+; CHECK-NEXT:    buffer_load_dword v13, v1, s[0:3], 0 offen offset:208
+; CHECK-NEXT:    buffer_load_dword v14, v1, s[0:3], 0 offen offset:204
+; CHECK-NEXT:    buffer_load_dword v15, v1, s[0:3], 0 offen offset:200
+; CHECK-NEXT:    buffer_load_dword v16, v1, s[0:3], 0 offen offset:196
+; CHECK-NEXT:    buffer_load_dword v17, v1, s[0:3], 0 offen offset:192
+; CHECK-NEXT:    buffer_load_dword v18, v1, s[0:3], 0 offen offset:188
+; CHECK-NEXT:    buffer_load_dword v19, v1, s[0:3], 0 offen offset:184
+; CHECK-NEXT:    buffer_load_dword v20, v1, s[0:3], 0 offen offset:180
+; CHECK-NEXT:    buffer_load_dword v21, v1, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    buffer_load_dword v22, v1, s[0:3], 0 offen offset:172
+; CHECK-NEXT:    buffer_load_dword v23, v1, s[0:3], 0 offen offset:168
+; CHECK-NEXT:    buffer_load_dword v24, v1, s[0:3], 0 offen offset:164
+; CHECK-NEXT:    buffer_load_dword v25, v1, s[0:3], 0 offen offset:160
+; CHECK-NEXT:    buffer_load_dword v26, v1, s[0:3], 0 offen offset:156
+; CHECK-NEXT:    buffer_load_dword v27, v1, s[0:3], 0 offen offset:152
+; CHECK-NEXT:    buffer_load_dword v28, v1, s[0:3], 0 offen offset:148
+; CHECK-NEXT:    buffer_load_dword v29, v1, s[0:3], 0 offen offset:144
+; CHECK-NEXT:    buffer_load_dword v30, v1, s[0:3], 0 offen offset:140
+; CHECK-NEXT:    buffer_load_dword v31, v1, s[0:3], 0 offen offset:136
+; CHECK-NEXT:    buffer_load_dword v32, v1, s[0:3], 0 offen offset:132
+; CHECK-NEXT:    buffer_load_dword v33, v1, s[0:3], 0 offen offset:128
+; CHECK-NEXT:    buffer_load_dword v34, v1, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v35, v1, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v36, v1, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v37, v1, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v38, v1, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v39, v1, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v48, v1, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v49, v1, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    buffer_load_dword v50, v1, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v51, v1, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v52, v1, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v53, v1, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    buffer_load_dword v54, v1, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    buffer_load_dword v55, v1, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v64, v1, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v65, v1, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v66, v1, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v67, v1, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v68, v1, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v69, v1, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v70, v1, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v71, v1, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v80, v1, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v81, v1, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v82, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v83, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v84, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v85, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v86, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v87, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v96, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v97, v1, s[0:3], 0 offen
+; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    v_add_nc_u32_e32 v1, 0x100, v1
+; CHECK-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(62)
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:248
+; CHECK-NEXT:    s_waitcnt vmcnt(61)
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:244
+; CHECK-NEXT:    s_waitcnt vmcnt(60)
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(59)
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:236
+; CHECK-NEXT:    s_waitcnt vmcnt(58)
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:232
+; CHECK-NEXT:    s_waitcnt vmcnt(57)
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:228
+; CHECK-NEXT:    s_waitcnt vmcnt(56)
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(55)
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:220
+; CHECK-NEXT:    s_waitcnt vmcnt(54)
+; CHECK-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
+; CHECK-NEXT:    s_waitcnt vmcnt(53)
+; CHECK-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:212
+; CHECK-NEXT:    s_waitcnt vmcnt(52)
+; CHECK-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(51)
+; CHECK-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:204
+; CHECK-NEXT:    s_waitcnt vmcnt(50)
+; CHECK-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:200
+; CHECK-NEXT:    s_waitcnt vmcnt(49)
+; CHECK-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:196
+; CHECK-NEXT:    s_waitcnt vmcnt(48)
+; CHECK-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(47)
+; CHECK-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:188
+; CHECK-NEXT:    s_waitcnt vmcnt(46)
+; CHECK-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:184
+; CHECK-NEXT:    s_waitcnt vmcnt(45)
+; CHECK-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:180
+; CHECK-NEXT:    s_waitcnt vmcnt(44)
+; CHECK-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(43)
+; CHECK-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:172
+; CHECK-NEXT:    s_waitcnt vmcnt(42)
+; CHECK-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:168
+; CHECK-NEXT:    s_waitcnt vmcnt(41)
+; CHECK-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:164
+; CHECK-NEXT:    s_waitcnt vmcnt(40)
+; CHECK-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(39)
+; CHECK-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:156
+; CHECK-NEXT:    s_waitcnt vmcnt(38)
+; CHECK-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:152
+; CHECK-NEXT:    s_waitcnt vmcnt(37)
+; CHECK-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:148
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:140
+; CHECK-NEXT:    s_waitcnt vmcnt(34)
+; CHECK-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:136
+; CHECK-NEXT:    s_waitcnt vmcnt(33)
+; CHECK-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:132
+; CHECK-NEXT:    s_waitcnt vmcnt(32)
+; CHECK-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(31)
+; CHECK-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    s_waitcnt vmcnt(30)
+; CHECK-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    s_waitcnt vmcnt(29)
+; CHECK-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    s_waitcnt vmcnt(28)
+; CHECK-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(27)
+; CHECK-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    s_waitcnt vmcnt(24)
+; CHECK-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(23)
+; CHECK-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    s_waitcnt vmcnt(22)
+; CHECK-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    s_waitcnt vmcnt(18)
+; CHECK-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    s_waitcnt vmcnt(17)
+; CHECK-NEXT:    buffer_store_dword v64, v0, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    buffer_store_dword v65, v0, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(15)
+; CHECK-NEXT:    buffer_store_dword v66, v0, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    buffer_store_dword v67, v0, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    buffer_store_dword v68, v0, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    buffer_store_dword v69, v0, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    buffer_store_dword v70, v0, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    buffer_store_dword v71, v0, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    buffer_store_dword v80, v0, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    buffer_store_dword v81, v0, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    buffer_store_dword v82, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    buffer_store_dword v83, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    buffer_store_dword v84, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    buffer_store_dword v85, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    buffer_store_dword v86, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    buffer_store_dword v87, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v96, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v97, v0, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
+; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; CHECK-NEXT:    s_cbranch_vccnz .LBB3_1
+; CHECK-NEXT:  ; %bb.2: ; %memcpy-split
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memcpy_p5_p5_sz2048:
+; ALIGNED:       ; %bb.0: ; %entry
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v72, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v73, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v74, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v75, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v76, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v77, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v78, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v79, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v88, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v89, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v90, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v91, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v92, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v93, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v94, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v95, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v104, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v105, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v106, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v107, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v108, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v109, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v110, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v111, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v120, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v121, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v122, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v123, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v124, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:  .LBB3_1: ; %load-store-loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    s_clause 0x34
+; ALIGNED-NEXT:    buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT:    buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT:    buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT:    buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT:    buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT:    buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT:    buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT:    buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT:    buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT:    buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT:    buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT:    buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT:    buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT:    buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT:    buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT:    buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT:    buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT:    buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT:    buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT:    buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT:    buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT:    buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT:    buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT:    buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT:    buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT:    buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT:    buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT:    buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT:    buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT:    buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT:    buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT:    buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT:    buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT:    buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT:    buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT:    buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT:    buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT:    buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT:    buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT:    buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT:    buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT:    buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT:    buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT:    buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT:    buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT:    buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT:    buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT:    buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT:    buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT:    buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT:    buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT:    buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; ALIGNED-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x3e
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT:    buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT:    buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT:    buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT:    buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT:    buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT:    buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT:    buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT:    buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT:    buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT:    buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT:    buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT:    buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT:    buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT:    buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT:    buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT:    buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:157
+; ALIGNED-NEXT:    buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT:    buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT:    buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT:    buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT:    buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT:    buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:151
+; ALIGNED-NEXT:    buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT:    buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT:    buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:148
+; ALIGNED-NEXT:    buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT:    buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:146
+; ALIGNED-NEXT:    buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:145
+; ALIGNED-NEXT:    buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT:    buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT:    buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:142
+; ALIGNED-NEXT:    buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:141
+; ALIGNED-NEXT:    buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT:    s_clause 0xa
+; ALIGNED-NEXT:    buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:139
+; ALIGNED-NEXT:    buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:138
+; ALIGNED-NEXT:    buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:137
+; ALIGNED-NEXT:    buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:136
+; ALIGNED-NEXT:    buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:135
+; ALIGNED-NEXT:    buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:134
+; ALIGNED-NEXT:    buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT:    buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:132
+; ALIGNED-NEXT:    buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT:    buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:130
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:129
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:125
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:123
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:122
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v1, 0x100, v1
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_byte v116, v0, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT:    buffer_store_byte v117, v0, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT:    buffer_store_byte v118, v0, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT:    buffer_store_byte v119, v0, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT:    buffer_store_byte v40, v0, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT:    buffer_store_byte v41, v0, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT:    buffer_store_byte v42, v0, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT:    buffer_store_byte v43, v0, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT:    buffer_store_byte v44, v0, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT:    buffer_store_byte v45, v0, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT:    buffer_store_byte v46, v0, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT:    buffer_store_byte v47, v0, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT:    buffer_store_byte v56, v0, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT:    buffer_store_byte v57, v0, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT:    buffer_store_byte v58, v0, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT:    buffer_store_byte v59, v0, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT:    buffer_store_byte v60, v0, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT:    buffer_store_byte v61, v0, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT:    buffer_store_byte v62, v0, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT:    buffer_store_byte v63, v0, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT:    buffer_store_byte v72, v0, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT:    buffer_store_byte v73, v0, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT:    buffer_store_byte v74, v0, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT:    buffer_store_byte v75, v0, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT:    buffer_store_byte v76, v0, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT:    buffer_store_byte v77, v0, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT:    buffer_store_byte v78, v0, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT:    buffer_store_byte v79, v0, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT:    buffer_store_byte v88, v0, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT:    buffer_store_byte v89, v0, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT:    buffer_store_byte v90, v0, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT:    buffer_store_byte v91, v0, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT:    buffer_store_byte v92, v0, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT:    buffer_store_byte v93, v0, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT:    buffer_store_byte v94, v0, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT:    buffer_store_byte v95, v0, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT:    buffer_store_byte v104, v0, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT:    buffer_store_byte v105, v0, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT:    buffer_store_byte v106, v0, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT:    buffer_store_byte v107, v0, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT:    buffer_store_byte v108, v0, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT:    buffer_store_byte v109, v0, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT:    buffer_store_byte v110, v0, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT:    buffer_store_byte v111, v0, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT:    buffer_store_byte v120, v0, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT:    buffer_store_byte v121, v0, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT:    buffer_store_byte v122, v0, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT:    buffer_store_byte v123, v0, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT:    buffer_store_byte v124, v0, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT:    buffer_store_byte v125, v0, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT:    buffer_store_byte v126, v0, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT:    buffer_store_byte v127, v0, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT:    buffer_store_byte v34, v0, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT:    buffer_store_byte v35, v0, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT:    buffer_store_byte v36, v0, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT:    buffer_store_byte v37, v0, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT:    buffer_store_byte v38, v0, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT:    buffer_store_byte v39, v0, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT:    buffer_store_byte v48, v0, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT:    buffer_store_byte v49, v0, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT:    buffer_store_byte v50, v0, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT:    buffer_store_byte v51, v0, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT:    buffer_store_byte v52, v0, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT:    buffer_store_byte v53, v0, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT:    buffer_store_byte v54, v0, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT:    buffer_store_byte v55, v0, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT:    buffer_store_byte v64, v0, s[0:3], 0 offen offset:157
+; ALIGNED-NEXT:    buffer_store_byte v65, v0, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT:    buffer_store_byte v66, v0, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT:    buffer_store_byte v67, v0, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT:    buffer_store_byte v68, v0, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT:    buffer_store_byte v69, v0, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT:    buffer_store_byte v70, v0, s[0:3], 0 offen offset:151
+; ALIGNED-NEXT:    buffer_store_byte v71, v0, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT:    buffer_store_byte v80, v0, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT:    buffer_store_byte v81, v0, s[0:3], 0 offen offset:148
+; ALIGNED-NEXT:    buffer_store_byte v82, v0, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT:    buffer_store_byte v83, v0, s[0:3], 0 offen offset:146
+; ALIGNED-NEXT:    buffer_store_byte v84, v0, s[0:3], 0 offen offset:145
+; ALIGNED-NEXT:    buffer_store_byte v85, v0, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT:    buffer_store_byte v86, v0, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT:    buffer_store_byte v87, v0, s[0:3], 0 offen offset:142
+; ALIGNED-NEXT:    buffer_store_byte v96, v0, s[0:3], 0 offen offset:141
+; ALIGNED-NEXT:    buffer_store_byte v97, v0, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT:    buffer_store_byte v98, v0, s[0:3], 0 offen offset:139
+; ALIGNED-NEXT:    buffer_store_byte v99, v0, s[0:3], 0 offen offset:138
+; ALIGNED-NEXT:    buffer_store_byte v100, v0, s[0:3], 0 offen offset:137
+; ALIGNED-NEXT:    buffer_store_byte v101, v0, s[0:3], 0 offen offset:136
+; ALIGNED-NEXT:    buffer_store_byte v102, v0, s[0:3], 0 offen offset:135
+; ALIGNED-NEXT:    buffer_store_byte v103, v0, s[0:3], 0 offen offset:134
+; ALIGNED-NEXT:    buffer_store_byte v112, v0, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT:    buffer_store_byte v113, v0, s[0:3], 0 offen offset:132
+; ALIGNED-NEXT:    buffer_store_byte v114, v0, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT:    buffer_store_byte v115, v0, s[0:3], 0 offen offset:130
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:129
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:125
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:123
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:122
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:121
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:120
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:119
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:116
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:114
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:113
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:110
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:109
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:107
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:106
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:105
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:104
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:103
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:100
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:98
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:97
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:94
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:93
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:91
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:90
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:89
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:88
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:81
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:80
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:79
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:78
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:76
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
+; ALIGNED-NEXT:    s_cbranch_vccnz .LBB3_1
+; ALIGNED-NEXT:  ; %bb.2: ; %memcpy-split
+; ALIGNED-NEXT:    s_clause 0x2f
+; ALIGNED-NEXT:    buffer_load_dword v127, off, s[0:3], s32
+; ALIGNED-NEXT:    buffer_load_dword v126, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT:    buffer_load_dword v125, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT:    buffer_load_dword v124, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT:    buffer_load_dword v123, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT:    buffer_load_dword v122, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT:    buffer_load_dword v121, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT:    buffer_load_dword v120, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT:    buffer_load_dword v111, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    buffer_load_dword v110, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_load_dword v109, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_load_dword v108, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    buffer_load_dword v107, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    buffer_load_dword v106, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_load_dword v105, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_load_dword v104, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    buffer_load_dword v95, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    buffer_load_dword v94, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_load_dword v93, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_load_dword v92, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    buffer_load_dword v91, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    buffer_load_dword v90, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_load_dword v89, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_load_dword v88, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    buffer_load_dword v79, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    buffer_load_dword v78, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_load_dword v77, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_load_dword v76, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    buffer_load_dword v75, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    buffer_load_dword v74, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_load_dword v73, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_load_dword v72, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memcpy_p5_p5_sz2048:
+; UNROLL3:       ; %bb.0: ; %entry
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    v_mov_b32_e32 v2, v1
+; UNROLL3-NEXT:    v_mov_b32_e32 v3, v0
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT:  .LBB3_1: ; %load-store-loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    s_clause 0xb
+; UNROLL3-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 48
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT:    v_add_nc_u32_e32 v2, 48, v2
+; UNROLL3-NEXT:    v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5]
+; UNROLL3-NEXT:    s_waitcnt vmcnt(11)
+; UNROLL3-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT:    s_waitcnt vmcnt(10)
+; UNROLL3-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT:    s_waitcnt vmcnt(9)
+; UNROLL3-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT:    s_waitcnt vmcnt(8)
+; UNROLL3-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT:    s_waitcnt vmcnt(7)
+; UNROLL3-NEXT:    buffer_store_dword v8, v3, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    s_waitcnt vmcnt(6)
+; UNROLL3-NEXT:    buffer_store_dword v9, v3, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT:    s_waitcnt vmcnt(5)
+; UNROLL3-NEXT:    buffer_store_dword v10, v3, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
+; UNROLL3-NEXT:    buffer_store_dword v11, v3, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(3)
+; UNROLL3-NEXT:    buffer_store_dword v12, v3, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    buffer_store_dword v13, v3, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    buffer_store_dword v14, v3, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    buffer_store_dword v15, v3, s[0:3], 0 offen
+; UNROLL3-NEXT:    v_add_nc_u32_e32 v3, 48, v3
+; UNROLL3-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; UNROLL3-NEXT:    s_cbranch_vccnz .LBB3_1
+; UNROLL3-NEXT:  ; %bb.2: ; %memcpy-split
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT:    s_waitcnt vmcnt(3)
+; UNROLL3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:2044
+; UNROLL3-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:2040
+; UNROLL3-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:2036
+; UNROLL3-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:2032
+; UNROLL3-NEXT:    s_waitcnt vmcnt(3)
+; UNROLL3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:2044
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:2040
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:2036
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:2032
+; UNROLL3-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 2048, i1 false)
+  ret void
+}
+
+define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz2048:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:  .LBB4_1: ; %load-store-loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_clause 0x3e
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    buffer_load_dword v26, v2, s[0:3], 0 offen offset:252
+; CHECK-NEXT:    buffer_load_dword v25, v2, s[0:3], 0 offen offset:248
+; CHECK-NEXT:    buffer_load_dword v24, v2, s[0:3], 0 offen offset:244
+; CHECK-NEXT:    buffer_load_dword v23, v2, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    buffer_load_dword v30, v2, s[0:3], 0 offen offset:236
+; CHECK-NEXT:    buffer_load_dword v29, v2, s[0:3], 0 offen offset:232
+; CHECK-NEXT:    buffer_load_dword v28, v2, s[0:3], 0 offen offset:228
+; CHECK-NEXT:    buffer_load_dword v27, v2, s[0:3], 0 offen offset:224
+; CHECK-NEXT:    buffer_load_dword v34, v2, s[0:3], 0 offen offset:220
+; CHECK-NEXT:    buffer_load_dword v33, v2, s[0:3], 0 offen offset:216
+; CHECK-NEXT:    buffer_load_dword v32, v2, s[0:3], 0 offen offset:212
+; CHECK-NEXT:    buffer_load_dword v31, v2, s[0:3], 0 offen offset:208
+; CHECK-NEXT:    buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
+; CHECK-NEXT:    buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
+; CHECK-NEXT:    buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
+; CHECK-NEXT:    buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT:    buffer_load_dword v51, v2, s[0:3], 0 offen offset:188
+; CHECK-NEXT:    buffer_load_dword v50, v2, s[0:3], 0 offen offset:184
+; CHECK-NEXT:    buffer_load_dword v49, v2, s[0:3], 0 offen offset:180
+; CHECK-NEXT:    buffer_load_dword v48, v2, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    buffer_load_dword v55, v2, s[0:3], 0 offen offset:172
+; CHECK-NEXT:    buffer_load_dword v54, v2, s[0:3], 0 offen offset:168
+; CHECK-NEXT:    buffer_load_dword v53, v2, s[0:3], 0 offen offset:164
+; CHECK-NEXT:    buffer_load_dword v52, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT:    buffer_load_dword v67, v2, s[0:3], 0 offen offset:156
+; CHECK-NEXT:    buffer_load_dword v66, v2, s[0:3], 0 offen offset:152
+; CHECK-NEXT:    buffer_load_dword v65, v2, s[0:3], 0 offen offset:148
+; CHECK-NEXT:    buffer_load_dword v64, v2, s[0:3], 0 offen offset:144
+; CHECK-NEXT:    buffer_load_dword v71, v2, s[0:3], 0 offen offset:140
+; CHECK-NEXT:    buffer_load_dword v70, v2, s[0:3], 0 offen offset:136
+; CHECK-NEXT:    buffer_load_dword v69, v2, s[0:3], 0 offen offset:132
+; CHECK-NEXT:    buffer_load_dword v68, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT:    buffer_load_dword v83, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v82, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v81, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v80, v2, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    buffer_load_dword v87, v2, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    buffer_load_dword v86, v2, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v85, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v84, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v96, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v97, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v98, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v99, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, 0x100, v2
+; CHECK-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(41)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[23:26] offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(37)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[27:30] offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(33)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[31:34] offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(29)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[35:38] offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(17)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:128
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[15:18] offset:112
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[19:22] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[11:14] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[7:10] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[3:6] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99]
+; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; CHECK-NEXT:    s_cbranch_vccnz .LBB4_1
+; CHECK-NEXT:  ; %bb.2: ; %memcpy-split
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memcpy_p0_p5_sz2048:
+; ALIGNED:       ; %bb.0: ; %entry
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v72, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v73, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v74, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v75, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v76, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v77, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v78, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v79, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v88, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v89, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v90, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v91, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v92, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v93, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v94, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v95, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v104, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v105, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v106, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v107, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v108, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v109, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v110, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v111, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v120, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v121, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v122, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v123, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v124, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill
+; ALIGNED-NEXT:  .LBB4_1: ; %load-store-loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    s_clause 0x39
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT:    buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT:    buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT:    buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT:    buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT:    buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT:    buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT:    buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT:    buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT:    buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT:    buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT:    buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT:    buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT:    buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT:    buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT:    buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT:    buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT:    buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT:    buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT:    buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT:    buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
+; ALIGNED-NEXT:    buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT:    buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
+; ALIGNED-NEXT:    buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
+; ALIGNED-NEXT:    buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT:    buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT:    s_waitcnt vmcnt(57)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(56)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(55)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(54)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(53)
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(52)
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(51)
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(50)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(49)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(48)
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT:    s_waitcnt vmcnt(45)
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(44)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(43)
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v9, 8, v5
+; ALIGNED-NEXT:    s_waitcnt vmcnt(41)
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v8, 8, v6
+; ALIGNED-NEXT:    v_lshl_or_b32 v5, v10, 8, v7
+; ALIGNED-NEXT:    v_lshl_or_b32 v6, v11, 8, v12
+; ALIGNED-NEXT:    v_lshl_or_b32 v7, v15, 8, v14
+; ALIGNED-NEXT:    v_lshl_or_b32 v8, v19, 8, v17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(40)
+; ALIGNED-NEXT:    v_lshl_or_b32 v9, v16, 8, v13
+; ALIGNED-NEXT:    s_waitcnt vmcnt(38)
+; ALIGNED-NEXT:    v_lshl_or_b32 v10, v20, 8, v18
+; ALIGNED-NEXT:    s_waitcnt vmcnt(36)
+; ALIGNED-NEXT:    v_lshl_or_b32 v11, v23, 8, v22
+; ALIGNED-NEXT:    s_waitcnt vmcnt(34)
+; ALIGNED-NEXT:    v_lshl_or_b32 v12, v28, 8, v25
+; ALIGNED-NEXT:    s_waitcnt vmcnt(32)
+; ALIGNED-NEXT:    v_lshl_or_b32 v13, v24, 8, v21
+; ALIGNED-NEXT:    s_waitcnt vmcnt(30)
+; ALIGNED-NEXT:    v_lshl_or_b32 v14, v27, 8, v26
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v6, 16, v5
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v8, 16, v7
+; ALIGNED-NEXT:    v_lshl_or_b32 v5, v10, 16, v9
+; ALIGNED-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
+; ALIGNED-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(28)
+; ALIGNED-NEXT:    v_lshl_or_b32 v15, v31, 8, v30
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(26)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v34, 8, v33
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(24)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v37, 8, v32
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(22)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v36, 8, v35
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(17)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v50, 8, v38
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(15)
+; ALIGNED-NEXT:    v_lshl_or_b32 v5, v49, 8, v39
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v6, v51, 8, v48
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(11)
+; ALIGNED-NEXT:    v_lshl_or_b32 v7, v53, 8, v52
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v0, 16, v15
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v5, 16, v4
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v55, 8, v29
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(11)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v67, 8, v66
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(9)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v64, 8, v54
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v68, 8, v65
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v70, 8, v69
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v80, 8, v71
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; ALIGNED-NEXT:    buffer_store_dword v127, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v81, 8, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:98
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:103
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:88
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:100
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:97
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:114
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:119
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:106
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:105
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:104
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:116
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:113
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:134
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:135
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:125
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:122
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:121
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:120
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:132
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:129
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x5
+; ALIGNED-NEXT:    buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:146
+; ALIGNED-NEXT:    buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT:    buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:151
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v126, 8, v125
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v123, 8, v5
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 8, v3
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT:    buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:157
+; ALIGNED-NEXT:    buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT:    buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT:    buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v109, 8, v121
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v107, 8, v108
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT:    buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT:    buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v93, 8, v105
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v106, 8, v91
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT:    buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT:    buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT:    buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT:    buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT:    buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT:    buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT:    buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v78, 8, v89
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v74, 8, v73
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v72, 8, v76
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v75, 8, v79
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT:    buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT:    buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT:    buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT:    buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v61, 8, v63
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v60, 8, v62
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT:    buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT:    buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v56, 8, v59
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v57, 8, v47
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT:    buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT:    buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT:    buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT:    buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT:    buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT:    buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT:    buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v43, 8, v46
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v40, 8, v119
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v118, 8, v42
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v41, 8, v45
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT:    buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT:    buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT:    buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT:    buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v115, 8, v117
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v114, 8, v116
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT:    buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT:    buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v103, 8, v113
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v112, 8, v102
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT:    buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT:    buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT:    buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT:    buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT:    buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT:    buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT:    buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v98, 8, v100
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v86, 8, v87
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v85, 8, v96
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v97, 8, v99
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT:    buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT:    buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT:    buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT:    buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v81, 8, v83
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v80, 8, v82
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT:    buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT:    buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v69, 8, v70
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v71, 8, v68
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT:    buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT:    buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT:    buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT:    buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v54, 8, v67
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v52, 8, v65
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT:    buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT:    buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT:    buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT:    buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT:    buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT:    buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT:    buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v53, 8, v66
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v48, 8, v49
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v51, 8, v64
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v39, 8, v50
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT:    buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT:    buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v36, 8, v38
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v55, 8, v37
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT:    buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v33, 8, v35
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v30, 8, v29
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v31, 8, v34
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v28, 8, v32
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x17
+; ALIGNED-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT:    v_lshl_or_b32 v124, v4, 16, v3
+; ALIGNED-NEXT:    s_clause 0x5
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen
+; ALIGNED-NEXT:    buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT:    buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT:    buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT:    buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT:    buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT:    s_waitcnt vmcnt(28)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v25, 8, v27
+; ALIGNED-NEXT:    s_waitcnt vmcnt(26)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v24, 8, v26
+; ALIGNED-NEXT:    s_waitcnt vmcnt(14)
+; ALIGNED-NEXT:    v_lshl_or_b32 v44, v12, 8, v16
+; ALIGNED-NEXT:    s_waitcnt vmcnt(10)
+; ALIGNED-NEXT:    v_lshl_or_b32 v58, v8, 8, v10
+; ALIGNED-NEXT:    v_lshl_or_b32 v104, v4, 16, v3
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v21, 8, v22
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v23, 8, v20
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v88, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v90, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v92, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v77, v4, 16, v3
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v17, 8, v19
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v14, 8, v13
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v95, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v101, v4, 16, v3
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v15, 8, v18
+; ALIGNED-NEXT:    v_lshl_or_b32 v84, v44, 16, v4
+; ALIGNED-NEXT:    v_lshl_or_b32 v44, v9, 8, v11
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v58, 16, v44
+; ALIGNED-NEXT:    v_lshl_or_b32 v44, v5, 8, v6
+; ALIGNED-NEXT:    v_lshl_or_b32 v58, v7, 8, v1
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v58, 16, v44
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT:    buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v94, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v44, v44, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v58, v58, 8, v94
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v58, 16, v44
+; ALIGNED-NEXT:    v_lshl_or_b32 v44, v90, 8, v88
+; ALIGNED-NEXT:    v_lshl_or_b32 v58, v95, 8, v92
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v58, 16, v44
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT:    buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT:    buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT:    buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT:    buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v44, v111, 8, v122
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v58, v110, 8, v120
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v58, 16, v44
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT:    buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT:    buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v44, v92, 8, v95
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v58, v94, 8, v90
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v58, 16, v44
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT:    buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT:    buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    buffer_store_dword v84, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v127, 8, v44
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v127, v58, 8, v88
+; ALIGNED-NEXT:    v_lshl_or_b32 v127, v0, 16, v127
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_add_co_u32 v3, vcc_lo, v0, s4
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s5, v0, vcc_lo
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v1 offset:250
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v7 offset:251
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v5 offset:249
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v8 offset:255
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v9 offset:253
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v10 offset:254
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v11 offset:252
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v6 offset:248
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v13 offset:242
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v14 offset:243
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v17 offset:241
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v12 offset:247
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v15 offset:245
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v16 offset:246
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v18 offset:244
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v19 offset:240
+; ALIGNED-NEXT:    buffer_store_dword v77, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_store_dword v104, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    buffer_store_dword v124, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT:    v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; ALIGNED-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v20 offset:234
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v23 offset:235
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v21 offset:233
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v24 offset:239
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v25 offset:237
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v26 offset:238
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v27 offset:236
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v22 offset:232
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v29 offset:226
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v30 offset:227
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v33 offset:225
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v28 offset:231
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v31 offset:229
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v32 offset:230
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v34 offset:228
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v35 offset:224
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v54 offset:213
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v52 offset:215
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v36 offset:209
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v55 offset:211
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v37 offset:210
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v65 offset:214
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v67 offset:212
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v49 offset:218
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v48 offset:219
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v53 offset:217
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v39 offset:223
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v51 offset:221
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v50 offset:222
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v64 offset:220
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v66 offset:216
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v38 offset:208
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v68 offset:202
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v71 offset:203
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v69 offset:201
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v80 offset:207
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v81 offset:205
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v82 offset:206
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v83 offset:204
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v70 offset:200
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v87 offset:194
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v86 offset:195
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v98 offset:193
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v85 offset:199
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v97 offset:197
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v96 offset:198
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v99 offset:196
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v100 offset:192
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v102 offset:186
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v112 offset:187
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v103 offset:185
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v114 offset:191
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v115 offset:189
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v116 offset:190
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v117 offset:188
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v113 offset:184
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v119 offset:178
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v40 offset:179
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v43 offset:177
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v118 offset:183
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v41 offset:181
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v42 offset:182
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v45 offset:180
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v46 offset:176
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v47 offset:170
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v57 offset:171
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v56 offset:169
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v60 offset:175
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v61 offset:173
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v62 offset:174
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v63 offset:172
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v59 offset:168
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v73 offset:162
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v74 offset:163
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v78 offset:161
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v72 offset:167
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v75 offset:165
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v76 offset:166
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v79 offset:164
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v89 offset:160
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v91 offset:154
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v106 offset:155
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v93 offset:153
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v107 offset:159
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v109 offset:157
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v108 offset:158
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v121 offset:156
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v105 offset:152
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v125 offset:146
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v126 offset:147
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:145
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v123 offset:151
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:149
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:150
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:148
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:144
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:138
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:139
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:137
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:143
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:141
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:142
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:140
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:136
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:130
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:131
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:129
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:135
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:133
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:134
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:132
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:128
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:352
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:122
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:123
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:121
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:127
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:125
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:126
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:124
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:120
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:114
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:115
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:113
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:119
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:117
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:118
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:116
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:112
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:376
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:380
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:372
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:106
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:107
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:105
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:111
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:109
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:110
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:108
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:104
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:98
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:99
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:97
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:103
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:101
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:102
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:100
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:96
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:320
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:90
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:91
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:89
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:95
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:93
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:94
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:92
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:88
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:82
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:83
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:81
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:87
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:85
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:86
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:84
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:80
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:74
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:75
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:73
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:79
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:77
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:78
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:76
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:72
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:66
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:67
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:65
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:71
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:69
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:70
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:68
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:64
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:61
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:58
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:59
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:57
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:63
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:62
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:60
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:56
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:53
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:50
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:51
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:49
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:55
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:54
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:52
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:48
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:432
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:43
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:42
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:41
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:40
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:47
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:46
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:45
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:44
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:35
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:34
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:33
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:32
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:39
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:38
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:37
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:36
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT:    buffer_store_dword v127, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:26
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:27
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:25
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:31
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:29
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:30
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:28
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:24
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v44 offset:18
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:19
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v58 offset:17
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:23
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:21
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:22
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:20
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v88 offset:16
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:412
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v90 offset:10
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v94 offset:11
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v111 offset:13
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v92 offset:9
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v110 offset:15
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v120 offset:14
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v122 offset:12
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v95 offset:8
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:2
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:3
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:1
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:7
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:5
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:6
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:4
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0
+; ALIGNED-NEXT:    s_cbranch_vccnz .LBB4_1
+; ALIGNED-NEXT:  ; %bb.2: ; %memcpy-split
+; ALIGNED-NEXT:    s_clause 0x2f
+; ALIGNED-NEXT:    buffer_load_dword v127, off, s[0:3], s32
+; ALIGNED-NEXT:    buffer_load_dword v126, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT:    buffer_load_dword v125, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT:    buffer_load_dword v124, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT:    buffer_load_dword v123, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT:    buffer_load_dword v122, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT:    buffer_load_dword v121, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT:    buffer_load_dword v120, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT:    buffer_load_dword v111, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    buffer_load_dword v110, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_load_dword v109, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_load_dword v108, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    buffer_load_dword v107, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    buffer_load_dword v106, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_load_dword v105, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_load_dword v104, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    buffer_load_dword v95, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    buffer_load_dword v94, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_load_dword v93, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_load_dword v92, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    buffer_load_dword v91, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    buffer_load_dword v90, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_load_dword v89, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_load_dword v88, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    buffer_load_dword v79, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    buffer_load_dword v78, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_load_dword v77, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_load_dword v76, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    buffer_load_dword v75, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    buffer_load_dword v74, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_load_dword v73, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_load_dword v72, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memcpy_p0_p5_sz2048:
+; UNROLL3:       ; %bb.0: ; %entry
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    v_mov_b32_e32 v3, v2
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT:    s_inst_prefetch 0x1
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB4_1: ; %load-store-loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    s_clause 0xb
+; UNROLL3-NEXT:    buffer_load_dword v4, v3, s[0:3], 0 offen
+; UNROLL3-NEXT:    buffer_load_dword v5, v3, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    buffer_load_dword v6, v3, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    buffer_load_dword v7, v3, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT:    buffer_load_dword v8, v3, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT:    buffer_load_dword v9, v3, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT:    buffer_load_dword v10, v3, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT:    buffer_load_dword v11, v3, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    buffer_load_dword v12, v3, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT:    buffer_load_dword v13, v3, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT:    buffer_load_dword v14, v3, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT:    buffer_load_dword v15, v3, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 48
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT:    v_add_nc_u32_e32 v3, 48, v3
+; UNROLL3-NEXT:    v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5]
+; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[8:11] offset:16
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[12:15] offset:32
+; UNROLL3-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; UNROLL3-NEXT:    s_cbranch_vccnz .LBB4_1
+; UNROLL3-NEXT:  ; %bb.2: ; %memcpy-split
+; UNROLL3-NEXT:    s_inst_prefetch 0x2
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:2016
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:2032
+; UNROLL3-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:2036
+; UNROLL3-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:2040
+; UNROLL3-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:2044
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:2032
+; UNROLL3-NEXT:    s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 2048, i1 false)
+  ret void
+}
+
+
+; memmove for address spaces 0, 1, 4, 5
+
+define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz2048:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, exec_lo
+; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB5_3
+; CHECK-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:  .LBB5_2: ; %memmove_fwd_loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_co_u32 v96, vcc_lo, v2, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_clause 0xf
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[96:97] offset:224
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[96:97] offset:240
+; CHECK-NEXT:    flat_load_dwordx4 v[12:15], v[96:97] offset:192
+; CHECK-NEXT:    flat_load_dwordx4 v[16:19], v[96:97] offset:208
+; CHECK-NEXT:    flat_load_dwordx4 v[20:23], v[96:97] offset:160
+; CHECK-NEXT:    flat_load_dwordx4 v[24:27], v[96:97] offset:176
+; CHECK-NEXT:    flat_load_dwordx4 v[28:31], v[96:97] offset:128
+; CHECK-NEXT:    flat_load_dwordx4 v[32:35], v[96:97] offset:144
+; CHECK-NEXT:    flat_load_dwordx4 v[36:39], v[96:97] offset:96
+; CHECK-NEXT:    flat_load_dwordx4 v[48:51], v[96:97] offset:112
+; CHECK-NEXT:    flat_load_dwordx4 v[52:55], v[96:97] offset:64
+; CHECK-NEXT:    flat_load_dwordx4 v[64:67], v[96:97] offset:80
+; CHECK-NEXT:    flat_load_dwordx4 v[68:71], v[96:97] offset:32
+; CHECK-NEXT:    flat_load_dwordx4 v[80:83], v[96:97] offset:48
+; CHECK-NEXT:    flat_load_dwordx4 v[84:87], v[96:97]
+; CHECK-NEXT:    flat_load_dwordx4 v[96:99], v[96:97] offset:16
+; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[4:7] offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[8:11] offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[12:15] offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[16:19] offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[20:23] offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[24:27] offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[28:31] offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[32:35] offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[36:39] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99] offset:16
+; CHECK-NEXT:    s_cmp_lg_u64 s[4:5], 0x800
+; CHECK-NEXT:    s_cbranch_scc1 .LBB5_2
+; CHECK-NEXT:  .LBB5_3: ; %Flow5
+; CHECK-NEXT:    s_andn2_saveexec_b32 s8, s6
+; CHECK-NEXT:    s_cbranch_execz .LBB5_6
+; CHECK-NEXT:  ; %bb.4: ; %memmove_bwd_loop.preheader
+; CHECK-NEXT:    s_movk_i32 s6, 0xff00
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0x700
+; CHECK-NEXT:    s_mov_b32 s7, -1
+; CHECK-NEXT:  .LBB5_5: ; %memmove_bwd_loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_co_u32 v96, vcc_lo, v2, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_clause 0xf
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[96:97] offset:224
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[96:97] offset:240
+; CHECK-NEXT:    flat_load_dwordx4 v[12:15], v[96:97] offset:192
+; CHECK-NEXT:    flat_load_dwordx4 v[16:19], v[96:97] offset:208
+; CHECK-NEXT:    flat_load_dwordx4 v[20:23], v[96:97] offset:160
+; CHECK-NEXT:    flat_load_dwordx4 v[24:27], v[96:97] offset:176
+; CHECK-NEXT:    flat_load_dwordx4 v[28:31], v[96:97] offset:128
+; CHECK-NEXT:    flat_load_dwordx4 v[32:35], v[96:97] offset:144
+; CHECK-NEXT:    flat_load_dwordx4 v[36:39], v[96:97] offset:96
+; CHECK-NEXT:    flat_load_dwordx4 v[48:51], v[96:97] offset:112
+; CHECK-NEXT:    flat_load_dwordx4 v[52:55], v[96:97] offset:64
+; CHECK-NEXT:    flat_load_dwordx4 v[64:67], v[96:97] offset:80
+; CHECK-NEXT:    flat_load_dwordx4 v[68:71], v[96:97] offset:32
+; CHECK-NEXT:    flat_load_dwordx4 v[80:83], v[96:97] offset:48
+; CHECK-NEXT:    flat_load_dwordx4 v[84:87], v[96:97]
+; CHECK-NEXT:    flat_load_dwordx4 v[96:99], v[96:97] offset:16
+; CHECK-NEXT:    s_add_u32 s4, s4, 0xffffff00
+; CHECK-NEXT:    s_addc_u32 s5, s5, -1
+; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[4:7] offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[8:11] offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[12:15] offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[16:19] offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[20:23] offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[24:27] offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[28:31] offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[32:35] offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[36:39] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99] offset:16
+; CHECK-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_scc0 .LBB5_5
+; CHECK-NEXT:  .LBB5_6: ; %Flow6
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memmove_p0_p0_sz2048:
+; ALIGNED:       ; %bb.0: ; %entry
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_mov_b32 s4, exec_lo
+; ALIGNED-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; ALIGNED-NEXT:    s_xor_b32 s6, exec_lo, s4
+; ALIGNED-NEXT:    s_cbranch_execz .LBB5_3
+; ALIGNED-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT:  .LBB5_2: ; %memmove_fwd_loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    v_add_co_u32 v20, vcc_lo, v2, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v21, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT:    s_clause 0xf
+; ALIGNED-NEXT:    flat_load_dwordx4 v[16:19], v[20:21] offset:240
+; ALIGNED-NEXT:    flat_load_dwordx4 v[22:25], v[20:21] offset:224
+; ALIGNED-NEXT:    flat_load_dwordx4 v[4:7], v[20:21]
+; ALIGNED-NEXT:    flat_load_dwordx4 v[8:11], v[20:21] offset:16
+; ALIGNED-NEXT:    flat_load_dwordx4 v[12:15], v[20:21] offset:32
+; ALIGNED-NEXT:    flat_load_dwordx4 v[98:101], v[20:21] offset:48
+; ALIGNED-NEXT:    flat_load_dwordx4 v[112:115], v[20:21] offset:64
+; ALIGNED-NEXT:    flat_load_dwordx4 v[82:85], v[20:21] offset:80
+; ALIGNED-NEXT:    flat_load_dwordx4 v[116:119], v[20:21] offset:96
+; ALIGNED-NEXT:    flat_load_dwordx4 v[66:69], v[20:21] offset:112
+; ALIGNED-NEXT:    flat_load_dwordx4 v[40:43], v[20:21] offset:128
+; ALIGNED-NEXT:    flat_load_dwordx4 v[50:53], v[20:21] offset:144
+; ALIGNED-NEXT:    flat_load_dwordx4 v[44:47], v[20:21] offset:160
+; ALIGNED-NEXT:    flat_load_dwordx4 v[34:37], v[20:21] offset:176
+; ALIGNED-NEXT:    flat_load_dwordx4 v[30:33], v[20:21] offset:192
+; ALIGNED-NEXT:    flat_load_dwordx4 v[26:29], v[20:21] offset:208
+; ALIGNED-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v21 offset:254
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v21 offset:252
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v20 offset:250
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v20 offset:248
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v19 offset:246
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v19 offset:244
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v18 offset:242
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v18 offset:240
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(22)
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 8, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 8, v20
+; ALIGNED-NEXT:    s_cmp_lg_u64 s[4:5], 0x800
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v25 offset:238
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:236
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v24 offset:234
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:232
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v23 offset:230
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v23 offset:228
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v22 offset:226
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v22 offset:224
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(16)
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v29 offset:222
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v29 offset:220
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v28 offset:218
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v28 offset:216
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v27 offset:214
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v27 offset:212
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v26 offset:210
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v26 offset:208
+; ALIGNED-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v33 offset:206
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v33 offset:204
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v32 offset:202
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v32 offset:200
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v31 offset:198
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:196
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v30 offset:194
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:192
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v37 offset:190
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v37 offset:188
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v36 offset:186
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:184
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v35 offset:182
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v35 offset:180
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v34 offset:178
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v34 offset:176
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v49 offset:174
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v49 offset:172
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v48 offset:170
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v48 offset:168
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v39 offset:166
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v39 offset:164
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v38 offset:162
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v38 offset:160
+; ALIGNED-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v53 offset:158
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v53 offset:156
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v52 offset:154
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v52 offset:152
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v51 offset:150
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v51 offset:148
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v50 offset:146
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v50 offset:144
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v65 offset:142
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v65 offset:140
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v64 offset:138
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v64 offset:136
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v55 offset:134
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v55 offset:132
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v54 offset:130
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v54 offset:128
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v69 offset:126
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v69 offset:124
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v68 offset:122
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v68 offset:120
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v67 offset:118
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v67 offset:116
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v66 offset:114
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v66 offset:112
+; ALIGNED-NEXT:    buffer_store_dword v116, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    buffer_store_dword v117, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_store_dword v118, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_store_dword v119, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v81, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    buffer_load_dword v80, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v81 offset:110
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v81 offset:108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v80 offset:106
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v80 offset:104
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v71 offset:102
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v71 offset:100
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v70 offset:98
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v70 offset:96
+; ALIGNED-NEXT:    buffer_store_dword v82, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    buffer_store_dword v83, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_store_dword v84, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_store_dword v85, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v85, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    buffer_load_dword v84, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_load_dword v83, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_load_dword v82, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v85 offset:94
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v85 offset:92
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v84 offset:90
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v84 offset:88
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v83 offset:86
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v83 offset:84
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v82 offset:82
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v82 offset:80
+; ALIGNED-NEXT:    buffer_store_dword v112, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    buffer_store_dword v113, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_store_dword v114, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_store_dword v115, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v97, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    buffer_load_dword v96, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_load_dword v87, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_load_dword v86, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v19, 8, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v18, 8, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 8, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v97 offset:78
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v97 offset:76
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v96 offset:74
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v96 offset:72
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v87 offset:70
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v87 offset:68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v86 offset:66
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v86 offset:64
+; ALIGNED-NEXT:    buffer_store_dword v98, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT:    buffer_store_dword v99, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT:    buffer_store_dword v100, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v101, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT:    buffer_load_dword v100, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT:    buffer_load_dword v99, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT:    buffer_load_dword v98, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v101 offset:62
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v101 offset:60
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v100 offset:58
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v100 offset:56
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v99 offset:54
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v99 offset:52
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v98 offset:50
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v98 offset:48
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v15 offset:42
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v15 offset:40
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v14 offset:46
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v14 offset:44
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v13 offset:34
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v13 offset:32
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v12 offset:38
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v12 offset:36
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v11 offset:30
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v11 offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v10 offset:26
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v10 offset:24
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v9 offset:22
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v9 offset:20
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v8 offset:18
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v8 offset:16
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:247
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v27
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:255
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v23, 8, v23
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v21 offset:253
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 24, v22
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v22, 8, v22
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:251
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v29
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 8, v29
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v20 offset:249
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 24, v28
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 8, v28
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v27, 8, v27
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v19 offset:245
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v19, 24, v26
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v26, 8, v26
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:215
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v51
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 8, v51
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v113 offset:243
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v33
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v33, 8, v33
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v18 offset:241
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v18, 24, v32
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v32, 8, v32
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v114 offset:239
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v31
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 8, v31
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:237
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v30
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 8, v30
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v115 offset:235
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v37
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v37, 8, v37
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:233
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v36
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 8, v36
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:231
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v35, 8, v35
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v23 offset:229
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v23, 24, v34
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v34, 8, v34
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v21 offset:227
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 24, v49
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v49
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v22 offset:225
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v22, 24, v48
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 8, v48
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:223
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v39
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v39, 8, v39
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v29 offset:221
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 24, v38
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v38, 8, v38
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v20 offset:219
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 24, v53
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 8, v53
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v28 offset:217
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 24, v52
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 8, v52
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v27 offset:213
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v27, 24, v50
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 8, v50
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v19 offset:211
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v19, 24, v65
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v65
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v26 offset:209
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v26, 24, v64
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v51 offset:149
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 24, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 8, v64
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v113 offset:207
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v55
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 8, v55
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v33 offset:205
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v33, 24, v54
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v18 offset:203
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v18, 24, v69
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 8, v69
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v32 offset:201
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v32, 24, v68
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 8, v68
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v114 offset:199
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v67
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v67
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:197
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v66
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 8, v66
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:195
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v81
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v81
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:193
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v80
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 8, v80
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v115 offset:191
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v71
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 8, v71
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v37 offset:189
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v37, 24, v70
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 8, v70
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:187
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v85
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:185
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v84
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 8, v84
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:183
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v83
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v35 offset:181
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v35, 24, v82
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 8, v82
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v23 offset:179
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v23, 24, v97
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v97, 8, v97
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v34 offset:177
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v34, 24, v96
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v96, 8, v96
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v21 offset:175
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 24, v87
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v87
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v49 offset:173
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 24, v86
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 8, v86
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v22 offset:171
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v22, 24, v101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 8, v101
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v48 offset:169
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 24, v100
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v100
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:167
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v99
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v39 offset:165
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v39, 24, v98
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v98
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v29 offset:163
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 24, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v15, 8, v15
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v38 offset:161
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v38, 24, v14
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v14, 8, v14
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v20 offset:159
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 24, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v13, 8, v13
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v53 offset:157
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 24, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v12, 8, v12
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v28 offset:155
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 24, v11
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 8, v11
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v52 offset:153
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 24, v10
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 8, v10
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:151
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v9
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v27 offset:147
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v50 offset:145
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v19 offset:143
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v65 offset:141
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v26 offset:139
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v64 offset:137
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v113 offset:135
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v55 offset:133
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v33 offset:131
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v54 offset:129
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v18 offset:127
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v69 offset:125
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v32 offset:123
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v68 offset:121
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v114 offset:119
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v67 offset:117
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:115
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v66 offset:113
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:111
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v81 offset:109
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:107
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v80 offset:105
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v115 offset:103
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v71 offset:101
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v37 offset:99
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v70 offset:97
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:95
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v85 offset:93
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:91
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v84 offset:89
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:87
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v83 offset:85
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v35 offset:83
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v82 offset:81
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v23 offset:79
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v97 offset:77
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v34 offset:75
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v96 offset:73
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v21 offset:71
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v87 offset:69
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v49 offset:67
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v86 offset:65
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v22 offset:63
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v101 offset:61
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v48 offset:59
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v100 offset:57
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:55
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v99 offset:53
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v39 offset:51
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v98 offset:49
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v29 offset:43
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v15 offset:41
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v38 offset:47
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v14 offset:45
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v20 offset:35
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v13 offset:33
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v53 offset:39
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v12 offset:37
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v28 offset:31
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v11 offset:29
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v52 offset:27
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v10 offset:25
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:23
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v9 offset:21
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v51 offset:19
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v8 offset:17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v7 offset:14
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v7 offset:12
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v6 offset:10
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v6 offset:8
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v5 offset:6
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v5 offset:4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v4 offset:2
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v4
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 24, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v8 offset:15
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v7 offset:13
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v9 offset:11
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v6 offset:9
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v10 offset:7
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v5 offset:5
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v11 offset:3
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v4 offset:1
+; ALIGNED-NEXT:    s_cbranch_scc1 .LBB5_2
+; ALIGNED-NEXT:  .LBB5_3: ; %Flow5
+; ALIGNED-NEXT:    s_andn2_saveexec_b32 s8, s6
+; ALIGNED-NEXT:    s_cbranch_execz .LBB5_6
+; ALIGNED-NEXT:  ; %bb.4: ; %memmove_bwd_loop.preheader
+; ALIGNED-NEXT:    s_movk_i32 s6, 0xff00
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0x700
+; ALIGNED-NEXT:    s_mov_b32 s7, -1
+; ALIGNED-NEXT:  .LBB5_5: ; %memmove_bwd_loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    v_add_co_u32 v24, vcc_lo, v2, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT:    s_clause 0xf
+; ALIGNED-NEXT:    flat_load_dwordx4 v[16:19], v[24:25] offset:240
+; ALIGNED-NEXT:    flat_load_dwordx4 v[20:23], v[24:25] offset:224
+; ALIGNED-NEXT:    flat_load_dwordx4 v[4:7], v[24:25]
+; ALIGNED-NEXT:    flat_load_dwordx4 v[8:11], v[24:25] offset:16
+; ALIGNED-NEXT:    flat_load_dwordx4 v[12:15], v[24:25] offset:32
+; ALIGNED-NEXT:    flat_load_dwordx4 v[112:115], v[24:25] offset:48
+; ALIGNED-NEXT:    flat_load_dwordx4 v[116:119], v[24:25] offset:64
+; ALIGNED-NEXT:    flat_load_dwordx4 v[40:43], v[24:25] offset:80
+; ALIGNED-NEXT:    flat_load_dwordx4 v[26:29], v[24:25] offset:96
+; ALIGNED-NEXT:    flat_load_dwordx4 v[32:35], v[24:25] offset:112
+; ALIGNED-NEXT:    flat_load_dwordx4 v[44:47], v[24:25] offset:128
+; ALIGNED-NEXT:    flat_load_dwordx4 v[52:55], v[24:25] offset:144
+; ALIGNED-NEXT:    flat_load_dwordx4 v[66:69], v[24:25] offset:160
+; ALIGNED-NEXT:    flat_load_dwordx4 v[81:84], v[24:25] offset:176
+; ALIGNED-NEXT:    flat_load_dwordx4 v[96:99], v[24:25] offset:192
+; ALIGNED-NEXT:    flat_load_dwordx4 v[100:103], v[24:25] offset:208
+; ALIGNED-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:320
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:320
+; ALIGNED-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0xffffff00
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, -1
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v31 offset:254
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:252
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v30 offset:250
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:248
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v25 offset:246
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:244
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v24 offset:242
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:240
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(22)
+; ALIGNED-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v51 offset:238
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v51 offset:236
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v50 offset:234
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v50 offset:232
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v49 offset:230
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v49 offset:228
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v36 offset:226
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:224
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(16)
+; ALIGNED-NEXT:    buffer_store_dword v100, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT:    buffer_store_dword v102, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT:    buffer_store_dword v103, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v31
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 8, v31
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v30
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 8, v30
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v71 offset:222
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v71 offset:220
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v70 offset:218
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v70 offset:216
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v65 offset:214
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v65 offset:212
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v64 offset:210
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v64 offset:208
+; ALIGNED-NEXT:    buffer_store_dword v96, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT:    buffer_store_dword v97, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT:    buffer_store_dword v98, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT:    buffer_store_dword v99, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v87, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT:    buffer_load_dword v86, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT:    buffer_load_dword v85, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT:    buffer_load_dword v80, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v87 offset:206
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v87 offset:204
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v86 offset:202
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v86 offset:200
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v85 offset:198
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v85 offset:196
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v80 offset:194
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v80 offset:192
+; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT:    buffer_store_dword v82, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT:    buffer_store_dword v83, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT:    buffer_store_dword v84, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v101, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT:    buffer_load_dword v99, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT:    buffer_load_dword v96, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT:    buffer_load_dword v81, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v101 offset:190
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v101 offset:188
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v99 offset:186
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v99 offset:184
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v96 offset:182
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v96 offset:180
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v81 offset:178
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v81 offset:176
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:412
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v100, off, s[0:3], s32 offset:412
+; ALIGNED-NEXT:    buffer_load_dword v97, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT:    buffer_load_dword v82, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v100 offset:174
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v100 offset:172
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v97 offset:170
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v97 offset:168
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v82 offset:166
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v82 offset:164
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v66 offset:162
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v66 offset:160
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:352
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v98, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT:    buffer_load_dword v83, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:352
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v98 offset:158
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v98 offset:156
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v83 offset:154
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v83 offset:152
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v67 offset:150
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v67 offset:148
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v52 offset:146
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v52 offset:144
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:372
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:376
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:380
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v84, off, s[0:3], s32 offset:380
+; ALIGNED-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:376
+; ALIGNED-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:372
+; ALIGNED-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v84 offset:142
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v84 offset:140
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v68 offset:138
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v68 offset:136
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v53 offset:134
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v53 offset:132
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v37 offset:130
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v37 offset:128
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v69 offset:126
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v69 offset:124
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v54 offset:122
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v54 offset:120
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v38 offset:118
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v38 offset:116
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v32 offset:114
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v32 offset:112
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v55 offset:110
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v55 offset:108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v39 offset:106
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v39 offset:104
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v33 offset:102
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v33 offset:100
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v26 offset:98
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v26 offset:96
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v48 offset:94
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v48 offset:92
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v34 offset:90
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v34 offset:88
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v27 offset:86
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v27 offset:84
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v21 offset:82
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v21 offset:80
+; ALIGNED-NEXT:    buffer_store_dword v116, off, s[0:3], s32 offset:432
+; ALIGNED-NEXT:    buffer_store_dword v117, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT:    buffer_store_dword v118, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT:    buffer_store_dword v119, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:432
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v35 offset:78
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v35 offset:76
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v28 offset:74
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v28 offset:72
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v22 offset:70
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v22 offset:68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v19 offset:66
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v19 offset:64
+; ALIGNED-NEXT:    buffer_store_dword v112, off, s[0:3], s32 offset:512
+; ALIGNED-NEXT:    buffer_store_dword v113, off, s[0:3], s32 offset:516
+; ALIGNED-NEXT:    buffer_store_dword v114, off, s[0:3], s32 offset:520
+; ALIGNED-NEXT:    buffer_store_dword v115, off, s[0:3], s32 offset:524
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:524
+; ALIGNED-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:520
+; ALIGNED-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:516
+; ALIGNED-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:512
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 8, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v51
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 8, v51
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v50
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 8, v50
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v29 offset:62
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v29 offset:60
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v23 offset:58
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v23 offset:56
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v20 offset:54
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v20 offset:52
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v18 offset:50
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v18 offset:48
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:528
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:536
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:540
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:536
+; ALIGNED-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:540
+; ALIGNED-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:528
+; ALIGNED-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v15 offset:42
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v15 offset:40
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v14 offset:46
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v14 offset:44
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v13 offset:34
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v13 offset:32
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v12 offset:38
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v12 offset:36
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v11 offset:30
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v11 offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v10 offset:26
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v10 offset:24
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v9 offset:22
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v9 offset:20
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v8 offset:18
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v8 offset:16
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:496
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:496
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:247
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v65
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:255
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v49
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v49
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:253
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v36
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 8, v36
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:251
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v71
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 8, v71
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:249
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v70
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 8, v70
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v65
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:245
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v64
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 8, v64
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:215
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v67
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v67
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v113 offset:243
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v87
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v87
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:241
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v86
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 8, v86
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v114 offset:239
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v85
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v51 offset:237
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 24, v80
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 8, v80
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v115 offset:235
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 8, v101
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v50 offset:233
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v99
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:231
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v96
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v96, 8, v96
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v49 offset:229
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 24, v81
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v81
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:227
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v100
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v100
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:225
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v97
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v97, 8, v97
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:223
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v82
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 8, v82
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v71 offset:221
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v66
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 8, v66
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:219
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v98
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v98
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v70 offset:217
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v83
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v65 offset:213
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 24, v52
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 8, v52
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:211
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v84
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 8, v84
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v64 offset:209
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 24, v68
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v67 offset:149
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 24, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 8, v68
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v113 offset:207
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v53
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 8, v53
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v87 offset:205
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 24, v37
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v37, 8, v37
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:203
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v69
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 8, v69
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v86 offset:201
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v54
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v114 offset:199
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v38
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v38, 8, v38
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v85 offset:197
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 24, v32
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v32, 8, v32
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v51 offset:195
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 24, v55
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 8, v55
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v80 offset:193
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v39
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v39, 8, v39
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v115 offset:191
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v33
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v33, 8, v33
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v101 offset:189
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v26
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v26, 8, v26
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v50 offset:187
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v48
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 8, v48
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v99 offset:185
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v34
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v34, 8, v34
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:183
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v27
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v27, 8, v27
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v96 offset:181
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v96, 24, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 8, v21
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v49 offset:179
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 24, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v35, 8, v35
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v81 offset:177
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 24, v28
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 8, v28
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:175
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v22
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v22, 8, v22
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v100 offset:173
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 24, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v19, 8, v19
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:171
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v29
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 8, v29
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v97 offset:169
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v97, 24, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v23, 8, v23
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:167
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 8, v20
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v82 offset:165
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v18, 8, v18
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v71 offset:163
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v15, 8, v15
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v66 offset:161
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 24, v14
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v14, 8, v14
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:159
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v13, 8, v13
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v98 offset:157
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 24, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v12, 8, v12
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v70 offset:155
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v11
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 8, v11
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v83 offset:153
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 24, v10
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 8, v10
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:151
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v9
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v65 offset:147
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v52 offset:145
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v25 offset:143
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v84 offset:141
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v64 offset:139
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v68 offset:137
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v113 offset:135
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v53 offset:133
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v87 offset:131
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v37 offset:129
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v24 offset:127
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v69 offset:125
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v86 offset:123
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v54 offset:121
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v114 offset:119
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v38 offset:117
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v85 offset:115
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v32 offset:113
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v51 offset:111
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v55 offset:109
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v80 offset:107
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v39 offset:105
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v115 offset:103
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v33 offset:101
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v101 offset:99
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v26 offset:97
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v50 offset:95
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v48 offset:93
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v99 offset:91
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v34 offset:89
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v102 offset:87
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v27 offset:85
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v96 offset:83
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v21 offset:81
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v49 offset:79
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v35 offset:77
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v81 offset:75
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v28 offset:73
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v31 offset:71
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v22 offset:69
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v100 offset:67
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v19 offset:65
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v36 offset:63
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v29 offset:61
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v97 offset:59
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v23 offset:57
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v103 offset:55
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v20 offset:53
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v82 offset:51
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v18 offset:49
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v71 offset:43
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v15 offset:41
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v66 offset:47
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v14 offset:45
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v30 offset:35
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v13 offset:33
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v98 offset:39
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v12 offset:37
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v70 offset:31
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v11 offset:29
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v83 offset:27
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v10 offset:25
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v112 offset:23
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v9 offset:21
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v67 offset:19
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v8 offset:17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v7 offset:14
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v7 offset:12
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v6 offset:10
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v6 offset:8
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v5 offset:6
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v5 offset:4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[16:17], v4 offset:2
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v4
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 24, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v8 offset:15
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v7 offset:13
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v9 offset:11
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v6 offset:9
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v10 offset:7
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v5 offset:5
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v11 offset:3
+; ALIGNED-NEXT:    flat_store_byte v[16:17], v4 offset:1
+; ALIGNED-NEXT:    s_cbranch_scc0 .LBB5_5
+; ALIGNED-NEXT:  .LBB5_6: ; %Flow6
+; ALIGNED-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_dword v47, off, s[0:3], s32
+; ALIGNED-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memmove_p0_p0_sz2048:
+; UNROLL3:       ; %bb.0: ; %entry
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    s_mov_b32 s4, exec_lo
+; UNROLL3-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; UNROLL3-NEXT:    s_xor_b32 s6, exec_lo, s4
+; UNROLL3-NEXT:    s_cbranch_execz .LBB5_4
+; UNROLL3-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB5_2: ; %memmove_fwd_loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    v_add_co_u32 v12, vcc_lo, v2, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    s_clause 0x2
+; UNROLL3-NEXT:    flat_load_dwordx4 v[4:7], v[12:13]
+; UNROLL3-NEXT:    flat_load_dwordx4 v[8:11], v[12:13] offset:16
+; UNROLL3-NEXT:    flat_load_dwordx4 v[12:15], v[12:13] offset:32
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 48
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[8:11] offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[12:15] offset:32
+; UNROLL3-NEXT:    s_cmp_lg_u64 s[4:5], 0x7e0
+; UNROLL3-NEXT:    s_cbranch_scc1 .LBB5_2
+; UNROLL3-NEXT:  ; %bb.3: ; %memmove_fwd_residual
+; UNROLL3-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:2016
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:2016
+; UNROLL3-NEXT:    flat_load_dwordx4 v[2:5], v[2:3] offset:2032
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:2032
+; UNROLL3-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; UNROLL3-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; UNROLL3-NEXT:  .LBB5_4: ; %Flow3
+; UNROLL3-NEXT:    s_andn2_saveexec_b32 s8, s6
+; UNROLL3-NEXT:    s_cbranch_execz .LBB5_7
+; UNROLL3-NEXT:  ; %bb.5: ; %memmove_bwd_residual
+; UNROLL3-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:2032
+; UNROLL3-NEXT:    s_movk_i32 s6, 0xffd0
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0x7b0
+; UNROLL3-NEXT:    s_mov_b32 s7, -1
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:2032
+; UNROLL3-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:2016
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:2016
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB5_6: ; %memmove_bwd_loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    v_add_co_u32 v12, vcc_lo, v2, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    s_clause 0x2
+; UNROLL3-NEXT:    flat_load_dwordx4 v[4:7], v[12:13]
+; UNROLL3-NEXT:    flat_load_dwordx4 v[8:11], v[12:13] offset:16
+; UNROLL3-NEXT:    flat_load_dwordx4 v[12:15], v[12:13] offset:32
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 0xffffffd0
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, -1
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[8:11] offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[12:15] offset:32
+; UNROLL3-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; UNROLL3-NEXT:    s_cbranch_scc0 .LBB5_6
+; UNROLL3-NEXT:  .LBB5_7: ; %Flow4
+; UNROLL3-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; UNROLL3-NEXT:    s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 2048, i1 false)
+  ret void
+}
+
+define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz2048:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, exec_lo
+; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB6_3
+; CHECK-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:  .LBB6_2: ; %memmove_fwd_loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_co_u32 v96, vcc_lo, v2, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_clause 0xf
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[96:97], off offset:224
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[96:97], off offset:240
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v[96:97], off offset:192
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v[96:97], off offset:208
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v[96:97], off offset:160
+; CHECK-NEXT:    global_load_dwordx4 v[24:27], v[96:97], off offset:176
+; CHECK-NEXT:    global_load_dwordx4 v[28:31], v[96:97], off offset:128
+; CHECK-NEXT:    global_load_dwordx4 v[32:35], v[96:97], off offset:144
+; CHECK-NEXT:    global_load_dwordx4 v[36:39], v[96:97], off offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[48:51], v[96:97], off offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[52:55], v[96:97], off offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[64:67], v[96:97], off offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[68:71], v[96:97], off offset:32
+; CHECK-NEXT:    global_load_dwordx4 v[80:83], v[96:97], off offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[84:87], v[96:97], off
+; CHECK-NEXT:    global_load_dwordx4 v[96:99], v[96:97], off offset:16
+; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(15)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[4:7], off offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[8:11], off offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[12:15], off offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[16:19], off offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[20:23], off offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[24:27], off offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[28:31], off offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[32:35], off offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[36:39], off offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[48:51], off offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[52:55], off offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[64:67], off offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[68:71], off offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[80:83], off offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[84:87], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[96:99], off offset:16
+; CHECK-NEXT:    s_cmp_lg_u64 s[4:5], 0x800
+; CHECK-NEXT:    s_cbranch_scc1 .LBB6_2
+; CHECK-NEXT:  .LBB6_3: ; %Flow9
+; CHECK-NEXT:    s_andn2_saveexec_b32 s8, s6
+; CHECK-NEXT:    s_cbranch_execz .LBB6_6
+; CHECK-NEXT:  ; %bb.4: ; %memmove_bwd_loop.preheader
+; CHECK-NEXT:    s_movk_i32 s6, 0xff00
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0x700
+; CHECK-NEXT:    s_mov_b32 s7, -1
+; CHECK-NEXT:  .LBB6_5: ; %memmove_bwd_loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_co_u32 v96, vcc_lo, v2, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_clause 0xf
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[96:97], off offset:224
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[96:97], off offset:240
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v[96:97], off offset:192
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v[96:97], off offset:208
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v[96:97], off offset:160
+; CHECK-NEXT:    global_load_dwordx4 v[24:27], v[96:97], off offset:176
+; CHECK-NEXT:    global_load_dwordx4 v[28:31], v[96:97], off offset:128
+; CHECK-NEXT:    global_load_dwordx4 v[32:35], v[96:97], off offset:144
+; CHECK-NEXT:    global_load_dwordx4 v[36:39], v[96:97], off offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[48:51], v[96:97], off offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[52:55], v[96:97], off offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[64:67], v[96:97], off offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[68:71], v[96:97], off offset:32
+; CHECK-NEXT:    global_load_dwordx4 v[80:83], v[96:97], off offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[84:87], v[96:97], off
+; CHECK-NEXT:    global_load_dwordx4 v[96:99], v[96:97], off offset:16
+; CHECK-NEXT:    s_add_u32 s4, s4, 0xffffff00
+; CHECK-NEXT:    s_addc_u32 s5, s5, -1
+; CHECK-NEXT:    s_waitcnt vmcnt(15)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[4:7], off offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[8:11], off offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[12:15], off offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[16:19], off offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[20:23], off offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[24:27], off offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[28:31], off offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[32:35], off offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[36:39], off offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[48:51], off offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[52:55], off offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[64:67], off offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[68:71], off offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[80:83], off offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[84:87], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v[100:101], v[96:99], off offset:16
+; CHECK-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_scc0 .LBB6_5
+; CHECK-NEXT:  .LBB6_6: ; %Flow10
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memmove_p1_p1_sz2048:
+; ALIGNED:       ; %bb.0: ; %entry
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_mov_b32 s4, exec_lo
+; ALIGNED-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; ALIGNED-NEXT:    s_xor_b32 s6, exec_lo, s4
+; ALIGNED-NEXT:    s_cbranch_execz .LBB6_3
+; ALIGNED-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT:  .LBB6_2: ; %memmove_fwd_loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    v_add_co_u32 v20, vcc_lo, v2, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v21, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT:    s_clause 0xf
+; ALIGNED-NEXT:    global_load_dwordx4 v[16:19], v[20:21], off offset:240
+; ALIGNED-NEXT:    global_load_dwordx4 v[22:25], v[20:21], off offset:224
+; ALIGNED-NEXT:    global_load_dwordx4 v[4:7], v[20:21], off
+; ALIGNED-NEXT:    global_load_dwordx4 v[8:11], v[20:21], off offset:16
+; ALIGNED-NEXT:    global_load_dwordx4 v[12:15], v[20:21], off offset:32
+; ALIGNED-NEXT:    global_load_dwordx4 v[98:101], v[20:21], off offset:48
+; ALIGNED-NEXT:    global_load_dwordx4 v[112:115], v[20:21], off offset:64
+; ALIGNED-NEXT:    global_load_dwordx4 v[82:85], v[20:21], off offset:80
+; ALIGNED-NEXT:    global_load_dwordx4 v[116:119], v[20:21], off offset:96
+; ALIGNED-NEXT:    global_load_dwordx4 v[66:69], v[20:21], off offset:112
+; ALIGNED-NEXT:    global_load_dwordx4 v[40:43], v[20:21], off offset:128
+; ALIGNED-NEXT:    global_load_dwordx4 v[50:53], v[20:21], off offset:144
+; ALIGNED-NEXT:    global_load_dwordx4 v[44:47], v[20:21], off offset:160
+; ALIGNED-NEXT:    global_load_dwordx4 v[34:37], v[20:21], off offset:176
+; ALIGNED-NEXT:    global_load_dwordx4 v[30:33], v[20:21], off offset:192
+; ALIGNED-NEXT:    global_load_dwordx4 v[26:29], v[20:21], off offset:208
+; ALIGNED-NEXT:    s_waitcnt vmcnt(15)
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v21, off offset:254
+; ALIGNED-NEXT:    global_store_byte v[16:17], v21, off offset:252
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v20, off offset:250
+; ALIGNED-NEXT:    global_store_byte v[16:17], v20, off offset:248
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v19, off offset:246
+; ALIGNED-NEXT:    global_store_byte v[16:17], v19, off offset:244
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v18, off offset:242
+; ALIGNED-NEXT:    global_store_byte v[16:17], v18, off offset:240
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 8, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 8, v20
+; ALIGNED-NEXT:    s_cmp_lg_u64 s[4:5], 0x800
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v25, off offset:238
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:236
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v24, off offset:234
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:232
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v23, off offset:230
+; ALIGNED-NEXT:    global_store_byte v[16:17], v23, off offset:228
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v22, off offset:226
+; ALIGNED-NEXT:    global_store_byte v[16:17], v22, off offset:224
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v29, off offset:222
+; ALIGNED-NEXT:    global_store_byte v[16:17], v29, off offset:220
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v28, off offset:218
+; ALIGNED-NEXT:    global_store_byte v[16:17], v28, off offset:216
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v27, off offset:214
+; ALIGNED-NEXT:    global_store_byte v[16:17], v27, off offset:212
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v26, off offset:210
+; ALIGNED-NEXT:    global_store_byte v[16:17], v26, off offset:208
+; ALIGNED-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v33, off offset:206
+; ALIGNED-NEXT:    global_store_byte v[16:17], v33, off offset:204
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v32, off offset:202
+; ALIGNED-NEXT:    global_store_byte v[16:17], v32, off offset:200
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v31, off offset:198
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:196
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v30, off offset:194
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:192
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v37, off offset:190
+; ALIGNED-NEXT:    global_store_byte v[16:17], v37, off offset:188
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v36, off offset:186
+; ALIGNED-NEXT:    global_store_byte v[16:17], v36, off offset:184
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v35, off offset:182
+; ALIGNED-NEXT:    global_store_byte v[16:17], v35, off offset:180
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v34, off offset:178
+; ALIGNED-NEXT:    global_store_byte v[16:17], v34, off offset:176
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v49, off offset:174
+; ALIGNED-NEXT:    global_store_byte v[16:17], v49, off offset:172
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v48, off offset:170
+; ALIGNED-NEXT:    global_store_byte v[16:17], v48, off offset:168
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v39, off offset:166
+; ALIGNED-NEXT:    global_store_byte v[16:17], v39, off offset:164
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v38, off offset:162
+; ALIGNED-NEXT:    global_store_byte v[16:17], v38, off offset:160
+; ALIGNED-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v53, off offset:158
+; ALIGNED-NEXT:    global_store_byte v[16:17], v53, off offset:156
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v52, off offset:154
+; ALIGNED-NEXT:    global_store_byte v[16:17], v52, off offset:152
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v51, off offset:150
+; ALIGNED-NEXT:    global_store_byte v[16:17], v51, off offset:148
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v50, off offset:146
+; ALIGNED-NEXT:    global_store_byte v[16:17], v50, off offset:144
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v65, off offset:142
+; ALIGNED-NEXT:    global_store_byte v[16:17], v65, off offset:140
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v64, off offset:138
+; ALIGNED-NEXT:    global_store_byte v[16:17], v64, off offset:136
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v55, off offset:134
+; ALIGNED-NEXT:    global_store_byte v[16:17], v55, off offset:132
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v54, off offset:130
+; ALIGNED-NEXT:    global_store_byte v[16:17], v54, off offset:128
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v69, off offset:126
+; ALIGNED-NEXT:    global_store_byte v[16:17], v69, off offset:124
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v68, off offset:122
+; ALIGNED-NEXT:    global_store_byte v[16:17], v68, off offset:120
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v67, off offset:118
+; ALIGNED-NEXT:    global_store_byte v[16:17], v67, off offset:116
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v66, off offset:114
+; ALIGNED-NEXT:    global_store_byte v[16:17], v66, off offset:112
+; ALIGNED-NEXT:    buffer_store_dword v116, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    buffer_store_dword v117, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_store_dword v118, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_store_dword v119, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v81, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    buffer_load_dword v80, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v81, off offset:110
+; ALIGNED-NEXT:    global_store_byte v[16:17], v81, off offset:108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v80, off offset:106
+; ALIGNED-NEXT:    global_store_byte v[16:17], v80, off offset:104
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v71, off offset:102
+; ALIGNED-NEXT:    global_store_byte v[16:17], v71, off offset:100
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v70, off offset:98
+; ALIGNED-NEXT:    global_store_byte v[16:17], v70, off offset:96
+; ALIGNED-NEXT:    buffer_store_dword v82, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    buffer_store_dword v83, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_store_dword v84, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_store_dword v85, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v85, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    buffer_load_dword v84, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_load_dword v83, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_load_dword v82, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v85, off offset:94
+; ALIGNED-NEXT:    global_store_byte v[16:17], v85, off offset:92
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v84, off offset:90
+; ALIGNED-NEXT:    global_store_byte v[16:17], v84, off offset:88
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v83, off offset:86
+; ALIGNED-NEXT:    global_store_byte v[16:17], v83, off offset:84
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v82, off offset:82
+; ALIGNED-NEXT:    global_store_byte v[16:17], v82, off offset:80
+; ALIGNED-NEXT:    buffer_store_dword v112, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    buffer_store_dword v113, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_store_dword v114, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_store_dword v115, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v97, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    buffer_load_dword v96, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_load_dword v87, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_load_dword v86, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v19, 8, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v18, 8, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 8, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v97, off offset:78
+; ALIGNED-NEXT:    global_store_byte v[16:17], v97, off offset:76
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v96, off offset:74
+; ALIGNED-NEXT:    global_store_byte v[16:17], v96, off offset:72
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v87, off offset:70
+; ALIGNED-NEXT:    global_store_byte v[16:17], v87, off offset:68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v86, off offset:66
+; ALIGNED-NEXT:    global_store_byte v[16:17], v86, off offset:64
+; ALIGNED-NEXT:    buffer_store_dword v98, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT:    buffer_store_dword v99, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT:    buffer_store_dword v100, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v101, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT:    buffer_load_dword v100, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT:    buffer_load_dword v99, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT:    buffer_load_dword v98, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v101, off offset:62
+; ALIGNED-NEXT:    global_store_byte v[16:17], v101, off offset:60
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v100, off offset:58
+; ALIGNED-NEXT:    global_store_byte v[16:17], v100, off offset:56
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v99, off offset:54
+; ALIGNED-NEXT:    global_store_byte v[16:17], v99, off offset:52
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v98, off offset:50
+; ALIGNED-NEXT:    global_store_byte v[16:17], v98, off offset:48
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v15, off offset:42
+; ALIGNED-NEXT:    global_store_byte v[16:17], v15, off offset:40
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v14, off offset:46
+; ALIGNED-NEXT:    global_store_byte v[16:17], v14, off offset:44
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v13, off offset:34
+; ALIGNED-NEXT:    global_store_byte v[16:17], v13, off offset:32
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v12, off offset:38
+; ALIGNED-NEXT:    global_store_byte v[16:17], v12, off offset:36
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v11, off offset:30
+; ALIGNED-NEXT:    global_store_byte v[16:17], v11, off offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v10, off offset:26
+; ALIGNED-NEXT:    global_store_byte v[16:17], v10, off offset:24
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v9, off offset:22
+; ALIGNED-NEXT:    global_store_byte v[16:17], v9, off offset:20
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v8, off offset:18
+; ALIGNED-NEXT:    global_store_byte v[16:17], v8, off offset:16
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:247
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v27
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:255
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v23, 8, v23
+; ALIGNED-NEXT:    global_store_byte v[16:17], v21, off offset:253
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 24, v22
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v22, 8, v22
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:251
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v29
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 8, v29
+; ALIGNED-NEXT:    global_store_byte v[16:17], v20, off offset:249
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 24, v28
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 8, v28
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v27, 8, v27
+; ALIGNED-NEXT:    global_store_byte v[16:17], v19, off offset:245
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v19, 24, v26
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v26, 8, v26
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:215
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v51
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 8, v51
+; ALIGNED-NEXT:    global_store_byte v[16:17], v113, off offset:243
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v33
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v33, 8, v33
+; ALIGNED-NEXT:    global_store_byte v[16:17], v18, off offset:241
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v18, 24, v32
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v32, 8, v32
+; ALIGNED-NEXT:    global_store_byte v[16:17], v114, off offset:239
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v31
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 8, v31
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:237
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v30
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 8, v30
+; ALIGNED-NEXT:    global_store_byte v[16:17], v115, off offset:235
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v37
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v37, 8, v37
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:233
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v36
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 8, v36
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:231
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v35, 8, v35
+; ALIGNED-NEXT:    global_store_byte v[16:17], v23, off offset:229
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v23, 24, v34
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v34, 8, v34
+; ALIGNED-NEXT:    global_store_byte v[16:17], v21, off offset:227
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 24, v49
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v49
+; ALIGNED-NEXT:    global_store_byte v[16:17], v22, off offset:225
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v22, 24, v48
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 8, v48
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:223
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v39
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v39, 8, v39
+; ALIGNED-NEXT:    global_store_byte v[16:17], v29, off offset:221
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 24, v38
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v38, 8, v38
+; ALIGNED-NEXT:    global_store_byte v[16:17], v20, off offset:219
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 24, v53
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 8, v53
+; ALIGNED-NEXT:    global_store_byte v[16:17], v28, off offset:217
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 24, v52
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 8, v52
+; ALIGNED-NEXT:    global_store_byte v[16:17], v27, off offset:213
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v27, 24, v50
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 8, v50
+; ALIGNED-NEXT:    global_store_byte v[16:17], v19, off offset:211
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v19, 24, v65
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v65
+; ALIGNED-NEXT:    global_store_byte v[16:17], v26, off offset:209
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v26, 24, v64
+; ALIGNED-NEXT:    global_store_byte v[16:17], v51, off offset:149
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 24, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 8, v64
+; ALIGNED-NEXT:    global_store_byte v[16:17], v113, off offset:207
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v55
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 8, v55
+; ALIGNED-NEXT:    global_store_byte v[16:17], v33, off offset:205
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v33, 24, v54
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
+; ALIGNED-NEXT:    global_store_byte v[16:17], v18, off offset:203
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v18, 24, v69
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 8, v69
+; ALIGNED-NEXT:    global_store_byte v[16:17], v32, off offset:201
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v32, 24, v68
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 8, v68
+; ALIGNED-NEXT:    global_store_byte v[16:17], v114, off offset:199
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v67
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v67
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:197
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v66
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 8, v66
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:195
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v81
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v81
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:193
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v80
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 8, v80
+; ALIGNED-NEXT:    global_store_byte v[16:17], v115, off offset:191
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v71
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 8, v71
+; ALIGNED-NEXT:    global_store_byte v[16:17], v37, off offset:189
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v37, 24, v70
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 8, v70
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:187
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v85
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
+; ALIGNED-NEXT:    global_store_byte v[16:17], v36, off offset:185
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v84
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 8, v84
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:183
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v83
+; ALIGNED-NEXT:    global_store_byte v[16:17], v35, off offset:181
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v35, 24, v82
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 8, v82
+; ALIGNED-NEXT:    global_store_byte v[16:17], v23, off offset:179
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v23, 24, v97
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v97, 8, v97
+; ALIGNED-NEXT:    global_store_byte v[16:17], v34, off offset:177
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v34, 24, v96
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v96, 8, v96
+; ALIGNED-NEXT:    global_store_byte v[16:17], v21, off offset:175
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 24, v87
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v87
+; ALIGNED-NEXT:    global_store_byte v[16:17], v49, off offset:173
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 24, v86
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 8, v86
+; ALIGNED-NEXT:    global_store_byte v[16:17], v22, off offset:171
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v22, 24, v101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 8, v101
+; ALIGNED-NEXT:    global_store_byte v[16:17], v48, off offset:169
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 24, v100
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v100
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:167
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v99
+; ALIGNED-NEXT:    global_store_byte v[16:17], v39, off offset:165
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v39, 24, v98
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v98
+; ALIGNED-NEXT:    global_store_byte v[16:17], v29, off offset:163
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 24, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v15, 8, v15
+; ALIGNED-NEXT:    global_store_byte v[16:17], v38, off offset:161
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v38, 24, v14
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v14, 8, v14
+; ALIGNED-NEXT:    global_store_byte v[16:17], v20, off offset:159
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 24, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v13, 8, v13
+; ALIGNED-NEXT:    global_store_byte v[16:17], v53, off offset:157
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 24, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v12, 8, v12
+; ALIGNED-NEXT:    global_store_byte v[16:17], v28, off offset:155
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 24, v11
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 8, v11
+; ALIGNED-NEXT:    global_store_byte v[16:17], v52, off offset:153
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 24, v10
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 8, v10
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:151
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v9
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
+; ALIGNED-NEXT:    global_store_byte v[16:17], v27, off offset:147
+; ALIGNED-NEXT:    global_store_byte v[16:17], v50, off offset:145
+; ALIGNED-NEXT:    global_store_byte v[16:17], v19, off offset:143
+; ALIGNED-NEXT:    global_store_byte v[16:17], v65, off offset:141
+; ALIGNED-NEXT:    global_store_byte v[16:17], v26, off offset:139
+; ALIGNED-NEXT:    global_store_byte v[16:17], v64, off offset:137
+; ALIGNED-NEXT:    global_store_byte v[16:17], v113, off offset:135
+; ALIGNED-NEXT:    global_store_byte v[16:17], v55, off offset:133
+; ALIGNED-NEXT:    global_store_byte v[16:17], v33, off offset:131
+; ALIGNED-NEXT:    global_store_byte v[16:17], v54, off offset:129
+; ALIGNED-NEXT:    global_store_byte v[16:17], v18, off offset:127
+; ALIGNED-NEXT:    global_store_byte v[16:17], v69, off offset:125
+; ALIGNED-NEXT:    global_store_byte v[16:17], v32, off offset:123
+; ALIGNED-NEXT:    global_store_byte v[16:17], v68, off offset:121
+; ALIGNED-NEXT:    global_store_byte v[16:17], v114, off offset:119
+; ALIGNED-NEXT:    global_store_byte v[16:17], v67, off offset:117
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:115
+; ALIGNED-NEXT:    global_store_byte v[16:17], v66, off offset:113
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:111
+; ALIGNED-NEXT:    global_store_byte v[16:17], v81, off offset:109
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:107
+; ALIGNED-NEXT:    global_store_byte v[16:17], v80, off offset:105
+; ALIGNED-NEXT:    global_store_byte v[16:17], v115, off offset:103
+; ALIGNED-NEXT:    global_store_byte v[16:17], v71, off offset:101
+; ALIGNED-NEXT:    global_store_byte v[16:17], v37, off offset:99
+; ALIGNED-NEXT:    global_store_byte v[16:17], v70, off offset:97
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:95
+; ALIGNED-NEXT:    global_store_byte v[16:17], v85, off offset:93
+; ALIGNED-NEXT:    global_store_byte v[16:17], v36, off offset:91
+; ALIGNED-NEXT:    global_store_byte v[16:17], v84, off offset:89
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:87
+; ALIGNED-NEXT:    global_store_byte v[16:17], v83, off offset:85
+; ALIGNED-NEXT:    global_store_byte v[16:17], v35, off offset:83
+; ALIGNED-NEXT:    global_store_byte v[16:17], v82, off offset:81
+; ALIGNED-NEXT:    global_store_byte v[16:17], v23, off offset:79
+; ALIGNED-NEXT:    global_store_byte v[16:17], v97, off offset:77
+; ALIGNED-NEXT:    global_store_byte v[16:17], v34, off offset:75
+; ALIGNED-NEXT:    global_store_byte v[16:17], v96, off offset:73
+; ALIGNED-NEXT:    global_store_byte v[16:17], v21, off offset:71
+; ALIGNED-NEXT:    global_store_byte v[16:17], v87, off offset:69
+; ALIGNED-NEXT:    global_store_byte v[16:17], v49, off offset:67
+; ALIGNED-NEXT:    global_store_byte v[16:17], v86, off offset:65
+; ALIGNED-NEXT:    global_store_byte v[16:17], v22, off offset:63
+; ALIGNED-NEXT:    global_store_byte v[16:17], v101, off offset:61
+; ALIGNED-NEXT:    global_store_byte v[16:17], v48, off offset:59
+; ALIGNED-NEXT:    global_store_byte v[16:17], v100, off offset:57
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:55
+; ALIGNED-NEXT:    global_store_byte v[16:17], v99, off offset:53
+; ALIGNED-NEXT:    global_store_byte v[16:17], v39, off offset:51
+; ALIGNED-NEXT:    global_store_byte v[16:17], v98, off offset:49
+; ALIGNED-NEXT:    global_store_byte v[16:17], v29, off offset:43
+; ALIGNED-NEXT:    global_store_byte v[16:17], v15, off offset:41
+; ALIGNED-NEXT:    global_store_byte v[16:17], v38, off offset:47
+; ALIGNED-NEXT:    global_store_byte v[16:17], v14, off offset:45
+; ALIGNED-NEXT:    global_store_byte v[16:17], v20, off offset:35
+; ALIGNED-NEXT:    global_store_byte v[16:17], v13, off offset:33
+; ALIGNED-NEXT:    global_store_byte v[16:17], v53, off offset:39
+; ALIGNED-NEXT:    global_store_byte v[16:17], v12, off offset:37
+; ALIGNED-NEXT:    global_store_byte v[16:17], v28, off offset:31
+; ALIGNED-NEXT:    global_store_byte v[16:17], v11, off offset:29
+; ALIGNED-NEXT:    global_store_byte v[16:17], v52, off offset:27
+; ALIGNED-NEXT:    global_store_byte v[16:17], v10, off offset:25
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:23
+; ALIGNED-NEXT:    global_store_byte v[16:17], v9, off offset:21
+; ALIGNED-NEXT:    global_store_byte v[16:17], v51, off offset:19
+; ALIGNED-NEXT:    global_store_byte v[16:17], v8, off offset:17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v7, off offset:14
+; ALIGNED-NEXT:    global_store_byte v[16:17], v7, off offset:12
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v6, off offset:10
+; ALIGNED-NEXT:    global_store_byte v[16:17], v6, off offset:8
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v5, off offset:6
+; ALIGNED-NEXT:    global_store_byte v[16:17], v5, off offset:4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v4, off offset:2
+; ALIGNED-NEXT:    global_store_byte v[16:17], v4, off
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 24, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
+; ALIGNED-NEXT:    global_store_byte v[16:17], v8, off offset:15
+; ALIGNED-NEXT:    global_store_byte v[16:17], v7, off offset:13
+; ALIGNED-NEXT:    global_store_byte v[16:17], v9, off offset:11
+; ALIGNED-NEXT:    global_store_byte v[16:17], v6, off offset:9
+; ALIGNED-NEXT:    global_store_byte v[16:17], v10, off offset:7
+; ALIGNED-NEXT:    global_store_byte v[16:17], v5, off offset:5
+; ALIGNED-NEXT:    global_store_byte v[16:17], v11, off offset:3
+; ALIGNED-NEXT:    global_store_byte v[16:17], v4, off offset:1
+; ALIGNED-NEXT:    s_cbranch_scc1 .LBB6_2
+; ALIGNED-NEXT:  .LBB6_3: ; %Flow9
+; ALIGNED-NEXT:    s_andn2_saveexec_b32 s8, s6
+; ALIGNED-NEXT:    s_cbranch_execz .LBB6_6
+; ALIGNED-NEXT:  ; %bb.4: ; %memmove_bwd_loop.preheader
+; ALIGNED-NEXT:    s_movk_i32 s6, 0xff00
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0x700
+; ALIGNED-NEXT:    s_mov_b32 s7, -1
+; ALIGNED-NEXT:  .LBB6_5: ; %memmove_bwd_loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    v_add_co_u32 v24, vcc_lo, v2, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v25, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT:    s_clause 0xf
+; ALIGNED-NEXT:    global_load_dwordx4 v[16:19], v[24:25], off offset:240
+; ALIGNED-NEXT:    global_load_dwordx4 v[20:23], v[24:25], off offset:224
+; ALIGNED-NEXT:    global_load_dwordx4 v[4:7], v[24:25], off
+; ALIGNED-NEXT:    global_load_dwordx4 v[8:11], v[24:25], off offset:16
+; ALIGNED-NEXT:    global_load_dwordx4 v[12:15], v[24:25], off offset:32
+; ALIGNED-NEXT:    global_load_dwordx4 v[112:115], v[24:25], off offset:48
+; ALIGNED-NEXT:    global_load_dwordx4 v[116:119], v[24:25], off offset:64
+; ALIGNED-NEXT:    global_load_dwordx4 v[40:43], v[24:25], off offset:80
+; ALIGNED-NEXT:    global_load_dwordx4 v[26:29], v[24:25], off offset:96
+; ALIGNED-NEXT:    global_load_dwordx4 v[32:35], v[24:25], off offset:112
+; ALIGNED-NEXT:    global_load_dwordx4 v[44:47], v[24:25], off offset:128
+; ALIGNED-NEXT:    global_load_dwordx4 v[52:55], v[24:25], off offset:144
+; ALIGNED-NEXT:    global_load_dwordx4 v[66:69], v[24:25], off offset:160
+; ALIGNED-NEXT:    global_load_dwordx4 v[81:84], v[24:25], off offset:176
+; ALIGNED-NEXT:    global_load_dwordx4 v[96:99], v[24:25], off offset:192
+; ALIGNED-NEXT:    global_load_dwordx4 v[100:103], v[24:25], off offset:208
+; ALIGNED-NEXT:    s_waitcnt vmcnt(15)
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:320
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:320
+; ALIGNED-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0xffffff00
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, -1
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v31, off offset:254
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:252
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v30, off offset:250
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:248
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v25, off offset:246
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:244
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v24, off offset:242
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:240
+; ALIGNED-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v51, off offset:238
+; ALIGNED-NEXT:    global_store_byte v[16:17], v51, off offset:236
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v50, off offset:234
+; ALIGNED-NEXT:    global_store_byte v[16:17], v50, off offset:232
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v49, off offset:230
+; ALIGNED-NEXT:    global_store_byte v[16:17], v49, off offset:228
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v36, off offset:226
+; ALIGNED-NEXT:    global_store_byte v[16:17], v36, off offset:224
+; ALIGNED-NEXT:    buffer_store_dword v100, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT:    buffer_store_dword v102, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT:    buffer_store_dword v103, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v71, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT:    buffer_load_dword v70, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT:    buffer_load_dword v65, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT:    buffer_load_dword v64, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v31
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 8, v31
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v30
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 8, v30
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v71, off offset:222
+; ALIGNED-NEXT:    global_store_byte v[16:17], v71, off offset:220
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v70, off offset:218
+; ALIGNED-NEXT:    global_store_byte v[16:17], v70, off offset:216
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v65, off offset:214
+; ALIGNED-NEXT:    global_store_byte v[16:17], v65, off offset:212
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v64, off offset:210
+; ALIGNED-NEXT:    global_store_byte v[16:17], v64, off offset:208
+; ALIGNED-NEXT:    buffer_store_dword v96, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT:    buffer_store_dword v97, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT:    buffer_store_dword v98, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT:    buffer_store_dword v99, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v87, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT:    buffer_load_dword v86, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT:    buffer_load_dword v85, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT:    buffer_load_dword v80, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v87, off offset:206
+; ALIGNED-NEXT:    global_store_byte v[16:17], v87, off offset:204
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v86, off offset:202
+; ALIGNED-NEXT:    global_store_byte v[16:17], v86, off offset:200
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v85, off offset:198
+; ALIGNED-NEXT:    global_store_byte v[16:17], v85, off offset:196
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v80, off offset:194
+; ALIGNED-NEXT:    global_store_byte v[16:17], v80, off offset:192
+; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT:    buffer_store_dword v82, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT:    buffer_store_dword v83, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT:    buffer_store_dword v84, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v101, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT:    buffer_load_dword v99, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT:    buffer_load_dword v96, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT:    buffer_load_dword v81, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v101, off offset:190
+; ALIGNED-NEXT:    global_store_byte v[16:17], v101, off offset:188
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v99, off offset:186
+; ALIGNED-NEXT:    global_store_byte v[16:17], v99, off offset:184
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v96, off offset:182
+; ALIGNED-NEXT:    global_store_byte v[16:17], v96, off offset:180
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v81, off offset:178
+; ALIGNED-NEXT:    global_store_byte v[16:17], v81, off offset:176
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:412
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v100, off, s[0:3], s32 offset:412
+; ALIGNED-NEXT:    buffer_load_dword v97, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT:    buffer_load_dword v82, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT:    buffer_load_dword v66, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v100, off offset:174
+; ALIGNED-NEXT:    global_store_byte v[16:17], v100, off offset:172
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v97, off offset:170
+; ALIGNED-NEXT:    global_store_byte v[16:17], v97, off offset:168
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v82, off offset:166
+; ALIGNED-NEXT:    global_store_byte v[16:17], v82, off offset:164
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v66, off offset:162
+; ALIGNED-NEXT:    global_store_byte v[16:17], v66, off offset:160
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:352
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v98, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT:    buffer_load_dword v83, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT:    buffer_load_dword v67, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:352
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v98, off offset:158
+; ALIGNED-NEXT:    global_store_byte v[16:17], v98, off offset:156
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v83, off offset:154
+; ALIGNED-NEXT:    global_store_byte v[16:17], v83, off offset:152
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v67, off offset:150
+; ALIGNED-NEXT:    global_store_byte v[16:17], v67, off offset:148
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v52, off offset:146
+; ALIGNED-NEXT:    global_store_byte v[16:17], v52, off offset:144
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:372
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:376
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:380
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v84, off, s[0:3], s32 offset:380
+; ALIGNED-NEXT:    buffer_load_dword v68, off, s[0:3], s32 offset:376
+; ALIGNED-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:372
+; ALIGNED-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v84, off offset:142
+; ALIGNED-NEXT:    global_store_byte v[16:17], v84, off offset:140
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v68, off offset:138
+; ALIGNED-NEXT:    global_store_byte v[16:17], v68, off offset:136
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v53, off offset:134
+; ALIGNED-NEXT:    global_store_byte v[16:17], v53, off offset:132
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v37, off offset:130
+; ALIGNED-NEXT:    global_store_byte v[16:17], v37, off offset:128
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v69, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v69, off offset:126
+; ALIGNED-NEXT:    global_store_byte v[16:17], v69, off offset:124
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v54, off offset:122
+; ALIGNED-NEXT:    global_store_byte v[16:17], v54, off offset:120
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v38, off offset:118
+; ALIGNED-NEXT:    global_store_byte v[16:17], v38, off offset:116
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v32, off offset:114
+; ALIGNED-NEXT:    global_store_byte v[16:17], v32, off offset:112
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v55, off offset:110
+; ALIGNED-NEXT:    global_store_byte v[16:17], v55, off offset:108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v39, off offset:106
+; ALIGNED-NEXT:    global_store_byte v[16:17], v39, off offset:104
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v33, off offset:102
+; ALIGNED-NEXT:    global_store_byte v[16:17], v33, off offset:100
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v26, off offset:98
+; ALIGNED-NEXT:    global_store_byte v[16:17], v26, off offset:96
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v48, off offset:94
+; ALIGNED-NEXT:    global_store_byte v[16:17], v48, off offset:92
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v34, off offset:90
+; ALIGNED-NEXT:    global_store_byte v[16:17], v34, off offset:88
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v27, off offset:86
+; ALIGNED-NEXT:    global_store_byte v[16:17], v27, off offset:84
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v21, off offset:82
+; ALIGNED-NEXT:    global_store_byte v[16:17], v21, off offset:80
+; ALIGNED-NEXT:    buffer_store_dword v116, off, s[0:3], s32 offset:432
+; ALIGNED-NEXT:    buffer_store_dword v117, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT:    buffer_store_dword v118, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT:    buffer_store_dword v119, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:432
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v35, off offset:78
+; ALIGNED-NEXT:    global_store_byte v[16:17], v35, off offset:76
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v28, off offset:74
+; ALIGNED-NEXT:    global_store_byte v[16:17], v28, off offset:72
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v22, off offset:70
+; ALIGNED-NEXT:    global_store_byte v[16:17], v22, off offset:68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v19, off offset:66
+; ALIGNED-NEXT:    global_store_byte v[16:17], v19, off offset:64
+; ALIGNED-NEXT:    buffer_store_dword v112, off, s[0:3], s32 offset:512
+; ALIGNED-NEXT:    buffer_store_dword v113, off, s[0:3], s32 offset:516
+; ALIGNED-NEXT:    buffer_store_dword v114, off, s[0:3], s32 offset:520
+; ALIGNED-NEXT:    buffer_store_dword v115, off, s[0:3], s32 offset:524
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:524
+; ALIGNED-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:520
+; ALIGNED-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:516
+; ALIGNED-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:512
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 8, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 8, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v51
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 8, v51
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v50
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 8, v50
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v29, off offset:62
+; ALIGNED-NEXT:    global_store_byte v[16:17], v29, off offset:60
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v23, off offset:58
+; ALIGNED-NEXT:    global_store_byte v[16:17], v23, off offset:56
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v20, off offset:54
+; ALIGNED-NEXT:    global_store_byte v[16:17], v20, off offset:52
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v18, off offset:50
+; ALIGNED-NEXT:    global_store_byte v[16:17], v18, off offset:48
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:528
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:536
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:540
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:536
+; ALIGNED-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:540
+; ALIGNED-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:528
+; ALIGNED-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v15, off offset:42
+; ALIGNED-NEXT:    global_store_byte v[16:17], v15, off offset:40
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v14, off offset:46
+; ALIGNED-NEXT:    global_store_byte v[16:17], v14, off offset:44
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v13, off offset:34
+; ALIGNED-NEXT:    global_store_byte v[16:17], v13, off offset:32
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v12, off offset:38
+; ALIGNED-NEXT:    global_store_byte v[16:17], v12, off offset:36
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v11, off offset:30
+; ALIGNED-NEXT:    global_store_byte v[16:17], v11, off offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v10, off offset:26
+; ALIGNED-NEXT:    global_store_byte v[16:17], v10, off offset:24
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v9, off offset:22
+; ALIGNED-NEXT:    global_store_byte v[16:17], v9, off offset:20
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v8, off offset:18
+; ALIGNED-NEXT:    global_store_byte v[16:17], v8, off offset:16
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:496
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:496
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:247
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v65
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:255
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v49
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v49
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:253
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v36
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 8, v36
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:251
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v71
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 8, v71
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:249
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v70
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 8, v70
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v65
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:245
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v64
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 8, v64
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:215
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v67
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v67
+; ALIGNED-NEXT:    global_store_byte v[16:17], v113, off offset:243
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v87
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v87
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:241
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v86
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 8, v86
+; ALIGNED-NEXT:    global_store_byte v[16:17], v114, off offset:239
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v85
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
+; ALIGNED-NEXT:    global_store_byte v[16:17], v51, off offset:237
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 24, v80
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 8, v80
+; ALIGNED-NEXT:    global_store_byte v[16:17], v115, off offset:235
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 8, v101
+; ALIGNED-NEXT:    global_store_byte v[16:17], v50, off offset:233
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v99
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:231
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v96
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v96, 8, v96
+; ALIGNED-NEXT:    global_store_byte v[16:17], v49, off offset:229
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 24, v81
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v81
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:227
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v100
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v100
+; ALIGNED-NEXT:    global_store_byte v[16:17], v36, off offset:225
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v97
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v97, 8, v97
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:223
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v82
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 8, v82
+; ALIGNED-NEXT:    global_store_byte v[16:17], v71, off offset:221
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v66
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 8, v66
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:219
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v98
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v98
+; ALIGNED-NEXT:    global_store_byte v[16:17], v70, off offset:217
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v83
+; ALIGNED-NEXT:    global_store_byte v[16:17], v65, off offset:213
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 24, v52
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 8, v52
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:211
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 24, v84
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 8, v84
+; ALIGNED-NEXT:    global_store_byte v[16:17], v64, off offset:209
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 24, v68
+; ALIGNED-NEXT:    global_store_byte v[16:17], v67, off offset:149
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 24, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 8, v68
+; ALIGNED-NEXT:    global_store_byte v[16:17], v113, off offset:207
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v53
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 8, v53
+; ALIGNED-NEXT:    global_store_byte v[16:17], v87, off offset:205
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 24, v37
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v37, 8, v37
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:203
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v69
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 8, v69
+; ALIGNED-NEXT:    global_store_byte v[16:17], v86, off offset:201
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v54
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
+; ALIGNED-NEXT:    global_store_byte v[16:17], v114, off offset:199
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v38
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v38, 8, v38
+; ALIGNED-NEXT:    global_store_byte v[16:17], v85, off offset:197
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 24, v32
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v32, 8, v32
+; ALIGNED-NEXT:    global_store_byte v[16:17], v51, off offset:195
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 24, v55
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 8, v55
+; ALIGNED-NEXT:    global_store_byte v[16:17], v80, off offset:193
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v39
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v39, 8, v39
+; ALIGNED-NEXT:    global_store_byte v[16:17], v115, off offset:191
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 24, v33
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v33, 8, v33
+; ALIGNED-NEXT:    global_store_byte v[16:17], v101, off offset:189
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v26
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v26, 8, v26
+; ALIGNED-NEXT:    global_store_byte v[16:17], v50, off offset:187
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v48
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 8, v48
+; ALIGNED-NEXT:    global_store_byte v[16:17], v99, off offset:185
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v34
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v34, 8, v34
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:183
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v27
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v27, 8, v27
+; ALIGNED-NEXT:    global_store_byte v[16:17], v96, off offset:181
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v96, 24, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v21, 8, v21
+; ALIGNED-NEXT:    global_store_byte v[16:17], v49, off offset:179
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 24, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v35, 8, v35
+; ALIGNED-NEXT:    global_store_byte v[16:17], v81, off offset:177
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 24, v28
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 8, v28
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:175
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 24, v22
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v22, 8, v22
+; ALIGNED-NEXT:    global_store_byte v[16:17], v100, off offset:173
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 24, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v19, 8, v19
+; ALIGNED-NEXT:    global_store_byte v[16:17], v36, off offset:171
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v29
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 8, v29
+; ALIGNED-NEXT:    global_store_byte v[16:17], v97, off offset:169
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v97, 24, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v23, 8, v23
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:167
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v20, 8, v20
+; ALIGNED-NEXT:    global_store_byte v[16:17], v82, off offset:165
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v18, 8, v18
+; ALIGNED-NEXT:    global_store_byte v[16:17], v71, off offset:163
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v15, 8, v15
+; ALIGNED-NEXT:    global_store_byte v[16:17], v66, off offset:161
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 24, v14
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v14, 8, v14
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:159
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v13, 8, v13
+; ALIGNED-NEXT:    global_store_byte v[16:17], v98, off offset:157
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 24, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v12, 8, v12
+; ALIGNED-NEXT:    global_store_byte v[16:17], v70, off offset:155
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v11
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 8, v11
+; ALIGNED-NEXT:    global_store_byte v[16:17], v83, off offset:153
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 24, v10
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 8, v10
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:151
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 24, v9
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
+; ALIGNED-NEXT:    global_store_byte v[16:17], v65, off offset:147
+; ALIGNED-NEXT:    global_store_byte v[16:17], v52, off offset:145
+; ALIGNED-NEXT:    global_store_byte v[16:17], v25, off offset:143
+; ALIGNED-NEXT:    global_store_byte v[16:17], v84, off offset:141
+; ALIGNED-NEXT:    global_store_byte v[16:17], v64, off offset:139
+; ALIGNED-NEXT:    global_store_byte v[16:17], v68, off offset:137
+; ALIGNED-NEXT:    global_store_byte v[16:17], v113, off offset:135
+; ALIGNED-NEXT:    global_store_byte v[16:17], v53, off offset:133
+; ALIGNED-NEXT:    global_store_byte v[16:17], v87, off offset:131
+; ALIGNED-NEXT:    global_store_byte v[16:17], v37, off offset:129
+; ALIGNED-NEXT:    global_store_byte v[16:17], v24, off offset:127
+; ALIGNED-NEXT:    global_store_byte v[16:17], v69, off offset:125
+; ALIGNED-NEXT:    global_store_byte v[16:17], v86, off offset:123
+; ALIGNED-NEXT:    global_store_byte v[16:17], v54, off offset:121
+; ALIGNED-NEXT:    global_store_byte v[16:17], v114, off offset:119
+; ALIGNED-NEXT:    global_store_byte v[16:17], v38, off offset:117
+; ALIGNED-NEXT:    global_store_byte v[16:17], v85, off offset:115
+; ALIGNED-NEXT:    global_store_byte v[16:17], v32, off offset:113
+; ALIGNED-NEXT:    global_store_byte v[16:17], v51, off offset:111
+; ALIGNED-NEXT:    global_store_byte v[16:17], v55, off offset:109
+; ALIGNED-NEXT:    global_store_byte v[16:17], v80, off offset:107
+; ALIGNED-NEXT:    global_store_byte v[16:17], v39, off offset:105
+; ALIGNED-NEXT:    global_store_byte v[16:17], v115, off offset:103
+; ALIGNED-NEXT:    global_store_byte v[16:17], v33, off offset:101
+; ALIGNED-NEXT:    global_store_byte v[16:17], v101, off offset:99
+; ALIGNED-NEXT:    global_store_byte v[16:17], v26, off offset:97
+; ALIGNED-NEXT:    global_store_byte v[16:17], v50, off offset:95
+; ALIGNED-NEXT:    global_store_byte v[16:17], v48, off offset:93
+; ALIGNED-NEXT:    global_store_byte v[16:17], v99, off offset:91
+; ALIGNED-NEXT:    global_store_byte v[16:17], v34, off offset:89
+; ALIGNED-NEXT:    global_store_byte v[16:17], v102, off offset:87
+; ALIGNED-NEXT:    global_store_byte v[16:17], v27, off offset:85
+; ALIGNED-NEXT:    global_store_byte v[16:17], v96, off offset:83
+; ALIGNED-NEXT:    global_store_byte v[16:17], v21, off offset:81
+; ALIGNED-NEXT:    global_store_byte v[16:17], v49, off offset:79
+; ALIGNED-NEXT:    global_store_byte v[16:17], v35, off offset:77
+; ALIGNED-NEXT:    global_store_byte v[16:17], v81, off offset:75
+; ALIGNED-NEXT:    global_store_byte v[16:17], v28, off offset:73
+; ALIGNED-NEXT:    global_store_byte v[16:17], v31, off offset:71
+; ALIGNED-NEXT:    global_store_byte v[16:17], v22, off offset:69
+; ALIGNED-NEXT:    global_store_byte v[16:17], v100, off offset:67
+; ALIGNED-NEXT:    global_store_byte v[16:17], v19, off offset:65
+; ALIGNED-NEXT:    global_store_byte v[16:17], v36, off offset:63
+; ALIGNED-NEXT:    global_store_byte v[16:17], v29, off offset:61
+; ALIGNED-NEXT:    global_store_byte v[16:17], v97, off offset:59
+; ALIGNED-NEXT:    global_store_byte v[16:17], v23, off offset:57
+; ALIGNED-NEXT:    global_store_byte v[16:17], v103, off offset:55
+; ALIGNED-NEXT:    global_store_byte v[16:17], v20, off offset:53
+; ALIGNED-NEXT:    global_store_byte v[16:17], v82, off offset:51
+; ALIGNED-NEXT:    global_store_byte v[16:17], v18, off offset:49
+; ALIGNED-NEXT:    global_store_byte v[16:17], v71, off offset:43
+; ALIGNED-NEXT:    global_store_byte v[16:17], v15, off offset:41
+; ALIGNED-NEXT:    global_store_byte v[16:17], v66, off offset:47
+; ALIGNED-NEXT:    global_store_byte v[16:17], v14, off offset:45
+; ALIGNED-NEXT:    global_store_byte v[16:17], v30, off offset:35
+; ALIGNED-NEXT:    global_store_byte v[16:17], v13, off offset:33
+; ALIGNED-NEXT:    global_store_byte v[16:17], v98, off offset:39
+; ALIGNED-NEXT:    global_store_byte v[16:17], v12, off offset:37
+; ALIGNED-NEXT:    global_store_byte v[16:17], v70, off offset:31
+; ALIGNED-NEXT:    global_store_byte v[16:17], v11, off offset:29
+; ALIGNED-NEXT:    global_store_byte v[16:17], v83, off offset:27
+; ALIGNED-NEXT:    global_store_byte v[16:17], v10, off offset:25
+; ALIGNED-NEXT:    global_store_byte v[16:17], v112, off offset:23
+; ALIGNED-NEXT:    global_store_byte v[16:17], v9, off offset:21
+; ALIGNED-NEXT:    global_store_byte v[16:17], v67, off offset:19
+; ALIGNED-NEXT:    global_store_byte v[16:17], v8, off offset:17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v7, off offset:14
+; ALIGNED-NEXT:    global_store_byte v[16:17], v7, off offset:12
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v6, off offset:10
+; ALIGNED-NEXT:    global_store_byte v[16:17], v6, off offset:8
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v5, off offset:6
+; ALIGNED-NEXT:    global_store_byte v[16:17], v5, off offset:4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    global_store_byte_d16_hi v[16:17], v4, off offset:2
+; ALIGNED-NEXT:    global_store_byte v[16:17], v4, off
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v8, 24, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v9, 24, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v10, 24, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
+; ALIGNED-NEXT:    global_store_byte v[16:17], v8, off offset:15
+; ALIGNED-NEXT:    global_store_byte v[16:17], v7, off offset:13
+; ALIGNED-NEXT:    global_store_byte v[16:17], v9, off offset:11
+; ALIGNED-NEXT:    global_store_byte v[16:17], v6, off offset:9
+; ALIGNED-NEXT:    global_store_byte v[16:17], v10, off offset:7
+; ALIGNED-NEXT:    global_store_byte v[16:17], v5, off offset:5
+; ALIGNED-NEXT:    global_store_byte v[16:17], v11, off offset:3
+; ALIGNED-NEXT:    global_store_byte v[16:17], v4, off offset:1
+; ALIGNED-NEXT:    s_cbranch_scc0 .LBB6_5
+; ALIGNED-NEXT:  .LBB6_6: ; %Flow10
+; ALIGNED-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_dword v47, off, s[0:3], s32
+; ALIGNED-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memmove_p1_p1_sz2048:
+; UNROLL3:       ; %bb.0: ; %entry
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    s_mov_b32 s4, exec_lo
+; UNROLL3-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; UNROLL3-NEXT:    s_xor_b32 s6, exec_lo, s4
+; UNROLL3-NEXT:    s_cbranch_execz .LBB6_4
+; UNROLL3-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB6_2: ; %memmove_fwd_loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    v_add_co_u32 v12, vcc_lo, v2, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    s_clause 0x2
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off
+; UNROLL3-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:16
+; UNROLL3-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:32
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 48
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:32
+; UNROLL3-NEXT:    s_cmp_lg_u64 s[4:5], 0x7e0
+; UNROLL3-NEXT:    s_cbranch_scc1 .LBB6_2
+; UNROLL3-NEXT:  ; %bb.3: ; %memmove_fwd_residual
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:2016
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:2016
+; UNROLL3-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:2032
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:2032
+; UNROLL3-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; UNROLL3-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; UNROLL3-NEXT:  .LBB6_4: ; %Flow7
+; UNROLL3-NEXT:    s_andn2_saveexec_b32 s8, s6
+; UNROLL3-NEXT:    s_cbranch_execz .LBB6_7
+; UNROLL3-NEXT:  ; %bb.5: ; %memmove_bwd_residual
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:2032
+; UNROLL3-NEXT:    s_movk_i32 s6, 0xffd0
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0x7b0
+; UNROLL3-NEXT:    s_mov_b32 s7, -1
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:2032
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:2016
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:2016
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB6_6: ; %memmove_bwd_loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    v_add_co_u32 v12, vcc_lo, v2, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    s_clause 0x2
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off
+; UNROLL3-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off offset:16
+; UNROLL3-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:32
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 0xffffffd0
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, -1
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:32
+; UNROLL3-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; UNROLL3-NEXT:    s_cbranch_scc0 .LBB6_6
+; UNROLL3-NEXT:  .LBB6_7: ; %Flow8
+; UNROLL3-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; UNROLL3-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 2048, i1 false)
+  ret void
+}
+
+define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz2048:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, exec_lo
+; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB7_3
+; CHECK-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:  .LBB7_2: ; %memmove_fwd_loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_co_u32 v96, vcc_lo, v2, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_clause 0xf
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[96:97], off offset:240
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[96:97], off offset:224
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v[96:97], off offset:208
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v[96:97], off offset:192
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v[96:97], off offset:176
+; CHECK-NEXT:    global_load_dwordx4 v[24:27], v[96:97], off offset:160
+; CHECK-NEXT:    global_load_dwordx4 v[28:31], v[96:97], off offset:144
+; CHECK-NEXT:    global_load_dwordx4 v[32:35], v[96:97], off offset:128
+; CHECK-NEXT:    global_load_dwordx4 v[36:39], v[96:97], off offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[48:51], v[96:97], off offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[52:55], v[96:97], off offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[64:67], v[96:97], off offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[68:71], v[96:97], off offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[80:83], v[96:97], off offset:32
+; CHECK-NEXT:    global_load_dwordx4 v[84:87], v[96:97], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[96:99], v[96:97], off
+; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[4:7] offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[8:11] offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[12:15] offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[16:19] offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[20:23] offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[24:27] offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[28:31] offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[32:35] offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[36:39] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99]
+; CHECK-NEXT:    s_cmp_lg_u64 s[4:5], 0x800
+; CHECK-NEXT:    s_cbranch_scc1 .LBB7_2
+; CHECK-NEXT:  .LBB7_3: ; %Flow6
+; CHECK-NEXT:    s_andn2_saveexec_b32 s8, s6
+; CHECK-NEXT:    s_cbranch_execz .LBB7_6
+; CHECK-NEXT:  ; %bb.4: ; %memmove_bwd_loop.preheader
+; CHECK-NEXT:    s_movk_i32 s6, 0xff00
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0x700
+; CHECK-NEXT:    s_mov_b32 s7, -1
+; CHECK-NEXT:  .LBB7_5: ; %memmove_bwd_loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_add_co_u32 v96, vcc_lo, v2, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_clause 0xf
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[96:97], off offset:240
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[96:97], off offset:224
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v[96:97], off offset:208
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v[96:97], off offset:192
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v[96:97], off offset:176
+; CHECK-NEXT:    global_load_dwordx4 v[24:27], v[96:97], off offset:160
+; CHECK-NEXT:    global_load_dwordx4 v[28:31], v[96:97], off offset:144
+; CHECK-NEXT:    global_load_dwordx4 v[32:35], v[96:97], off offset:128
+; CHECK-NEXT:    global_load_dwordx4 v[36:39], v[96:97], off offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[48:51], v[96:97], off offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[52:55], v[96:97], off offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[64:67], v[96:97], off offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[68:71], v[96:97], off offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[80:83], v[96:97], off offset:32
+; CHECK-NEXT:    global_load_dwordx4 v[84:87], v[96:97], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[96:99], v[96:97], off
+; CHECK-NEXT:    s_add_u32 s4, s4, 0xffffff00
+; CHECK-NEXT:    s_addc_u32 s5, s5, -1
+; CHECK-NEXT:    s_waitcnt vmcnt(15)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[4:7] offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[8:11] offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[12:15] offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[16:19] offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[20:23] offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[24:27] offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[28:31] offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[32:35] offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[36:39] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99]
+; CHECK-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_scc0 .LBB7_5
+; CHECK-NEXT:  .LBB7_6: ; %Flow7
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memmove_p0_p4_sz2048:
+; ALIGNED:       ; %bb.0: ; %entry
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    s_mov_b32 s4, exec_lo
+; ALIGNED-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; ALIGNED-NEXT:    s_xor_b32 s6, exec_lo, s4
+; ALIGNED-NEXT:    s_cbranch_execz .LBB7_3
+; ALIGNED-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT:  .LBB7_2: ; %memmove_fwd_loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    v_add_co_u32 v4, vcc_lo, v2, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT:    v_add_co_u32 v96, vcc_lo, v0, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT:    s_clause 0xf
+; ALIGNED-NEXT:    global_load_dwordx4 v[112:115], v[4:5], off offset:240
+; ALIGNED-NEXT:    global_load_dwordx4 v[84:87], v[4:5], off offset:224
+; ALIGNED-NEXT:    global_load_dwordx4 v[80:83], v[4:5], off offset:208
+; ALIGNED-NEXT:    global_load_dwordx4 v[68:71], v[4:5], off offset:192
+; ALIGNED-NEXT:    global_load_dwordx4 v[64:67], v[4:5], off offset:176
+; ALIGNED-NEXT:    global_load_dwordx4 v[52:55], v[4:5], off offset:160
+; ALIGNED-NEXT:    global_load_dwordx4 v[48:51], v[4:5], off offset:144
+; ALIGNED-NEXT:    global_load_dwordx4 v[36:39], v[4:5], off offset:128
+; ALIGNED-NEXT:    global_load_dwordx4 v[32:35], v[4:5], off offset:112
+; ALIGNED-NEXT:    global_load_dwordx4 v[28:31], v[4:5], off offset:96
+; ALIGNED-NEXT:    global_load_dwordx4 v[24:27], v[4:5], off offset:80
+; ALIGNED-NEXT:    global_load_dwordx4 v[20:23], v[4:5], off offset:64
+; ALIGNED-NEXT:    global_load_dwordx4 v[16:19], v[4:5], off offset:48
+; ALIGNED-NEXT:    global_load_dwordx4 v[12:15], v[4:5], off offset:32
+; ALIGNED-NEXT:    global_load_dwordx4 v[8:11], v[4:5], off offset:16
+; ALIGNED-NEXT:    global_load_dwordx4 v[4:7], v[4:5], off
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(15)
+; ALIGNED-NEXT:    buffer_store_dword v114, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_store_dword v115, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    buffer_store_dword v113, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_store_dword v112, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 24, v114
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v114
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v114 offset:250
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v115 offset:254
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 24, v115
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:252
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 8, v115
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:248
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v113
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v113 offset:246
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 8, v113
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:244
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v112 offset:242
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:240
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v112
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(14)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v86
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v86
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:251
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 24, v87
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:249
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v87
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:255
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 24, v85
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:253
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 8, v85
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:247
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v84
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:245
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 8, v84
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:243
+; ALIGNED-NEXT:    s_waitcnt vmcnt(13)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v82
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:241
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v82
+; ALIGNED-NEXT:    buffer_store_dword v86, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_store_dword v87, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    buffer_store_dword v85, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_store_dword v84, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v86 offset:234
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v87 offset:238
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:236
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:232
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v85 offset:230
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:228
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v84 offset:226
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:224
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v81
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v81
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:235
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v80
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:233
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v80
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:239
+; ALIGNED-NEXT:    s_waitcnt vmcnt(12)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 24, v70
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:237
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v70
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:231
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 24, v71
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:229
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 8, v71
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:227
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v69
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:225
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 8, v69
+; ALIGNED-NEXT:    buffer_store_dword v82, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_store_dword v83, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_store_dword v80, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v82 offset:218
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v83 offset:222
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:220
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:216
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v81 offset:214
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:212
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v80 offset:210
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:208
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v68
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(11)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v66
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v66
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:219
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v67
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:217
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v67
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:223
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v65
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:221
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v65
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:215
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v64
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:213
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v64
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:211
+; ALIGNED-NEXT:    s_waitcnt vmcnt(10)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v54
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:209
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v54
+; ALIGNED-NEXT:    buffer_store_dword v70, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_store_dword v71, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v70 offset:202
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v71 offset:206
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:204
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:200
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v69 offset:198
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:196
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v68 offset:194
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:192
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v52
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 8, v52
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:203
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 24, v53
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 24, v55
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 8, v55
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:201
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v53
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:207
+; ALIGNED-NEXT:    s_waitcnt vmcnt(9)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 24, v50
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:205
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 8, v50
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:199
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v51
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:197
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 8, v51
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:195
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v49
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:193
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v49
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    buffer_store_dword v65, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_store_dword v64, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v66 offset:186
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v67 offset:190
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v67 offset:188
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v66 offset:184
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v65 offset:182
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v65 offset:180
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v64 offset:178
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v64 offset:176
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 24, v48
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v48
+; ALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 24, v38
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v38
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:187
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v39
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:185
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v39
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:191
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v37
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:189
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v37
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:183
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v36
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:181
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v36
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:179
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v34
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:177
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v34
+; ALIGNED-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v54 offset:170
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v54 offset:168
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v55 offset:174
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v55 offset:172
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v52 offset:162
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v52 offset:160
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v53 offset:166
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v53 offset:164
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 24, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 8, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 24, v33
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 8, v33
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:171
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v32
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:169
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v32
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:163
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v31
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:161
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 8, v31
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:167
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 24, v29
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:175
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 24, v30
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:173
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 8, v30
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:165
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v29
+; ALIGNED-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v50 offset:154
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v51 offset:158
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v51 offset:156
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v50 offset:152
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v49 offset:150
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v49 offset:148
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v48 offset:146
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v48 offset:144
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 24, v28
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v26
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 8, v26
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:155
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:153
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:159
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:157
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:151
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:149
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v64 offset:147
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 24, v22
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v65 offset:145
+; ALIGNED-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v38 offset:138
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v39 offset:142
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v39 offset:140
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v38 offset:136
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v37 offset:134
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v37 offset:132
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v36 offset:130
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v36 offset:128
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v66 offset:139
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v67 offset:137
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:143
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:141
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:135
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:133
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:131
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:129
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v34 offset:122
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v35 offset:126
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v35 offset:124
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v34 offset:120
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v33 offset:118
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v33 offset:116
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v32 offset:114
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v32 offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v34, 24, v14
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:123
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:121
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v52 offset:127
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v53 offset:125
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v54 offset:119
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v55 offset:117
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:115
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v10
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:113
+; ALIGNED-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v28 offset:96
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:111
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 24, v27
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v22
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v35, 8, v14
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v10
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:109
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 8, v6
+; ALIGNED-NEXT:    s_cmp_lg_u64 s[4:5], 0x800
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 8, v27
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 24, v11
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:103
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 24, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 24, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v37, 8, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v38, 24, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v39, 8, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 24, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v17
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v17
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v32, 24, v16
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v33, 8, v16
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 24, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 8, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 24, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 8, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 8, v11
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v9
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 8, v9
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:107
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 24, v8
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:105
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 8, v8
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 8, v7
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v48 offset:99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 24, v5
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v49 offset:97
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v5
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT:    buffer_store_dword v24, off, s[0:3], s32
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v50 offset:91
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v4
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v51 offset:89
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 8, v4
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v26 offset:90
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v27 offset:94
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:95
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v27 offset:92
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:93
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v26 offset:88
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:87
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v25 offset:86
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:85
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v25 offset:84
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:83
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v24 offset:82
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:81
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v24 offset:80
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v22 offset:74
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v64 offset:75
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v65 offset:73
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v23 offset:78
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v36 offset:79
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v23 offset:76
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v37 offset:77
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v22 offset:72
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v38 offset:71
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v21 offset:70
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v39 offset:69
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v21 offset:68
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v66 offset:67
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v20 offset:66
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v67 offset:65
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v20 offset:64
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v18 offset:58
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:59
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:57
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v19 offset:62
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:63
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v19 offset:60
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:61
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v18 offset:56
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:55
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v17 offset:54
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:53
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v17 offset:52
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v32 offset:51
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v16 offset:50
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v33 offset:49
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v16 offset:48
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v14 offset:42
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v34 offset:43
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v35 offset:41
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v15 offset:46
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:47
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v15 offset:44
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:45
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v14 offset:40
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v52 offset:39
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v13 offset:38
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v53 offset:37
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v13 offset:36
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v54 offset:35
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v12 offset:34
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v55 offset:33
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v12 offset:32
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v10 offset:26
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:27
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:25
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v11 offset:30
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v28 offset:31
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v11 offset:28
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v29 offset:29
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v10 offset:24
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v30 offset:23
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v9 offset:22
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v31 offset:21
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v9 offset:20
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:19
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v8 offset:18
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:17
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v8 offset:16
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v6 offset:10
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:11
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:9
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v7 offset:14
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:15
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v7 offset:12
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:13
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v6 offset:8
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v48 offset:7
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v5 offset:6
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v49 offset:5
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v5 offset:4
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v50 offset:3
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v4 offset:2
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v51 offset:1
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v4
+; ALIGNED-NEXT:    s_cbranch_scc1 .LBB7_2
+; ALIGNED-NEXT:  .LBB7_3: ; %Flow6
+; ALIGNED-NEXT:    s_andn2_saveexec_b32 s8, s6
+; ALIGNED-NEXT:    s_cbranch_execz .LBB7_6
+; ALIGNED-NEXT:  ; %bb.4: ; %memmove_bwd_loop.preheader
+; ALIGNED-NEXT:    s_movk_i32 s6, 0xff00
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0x700
+; ALIGNED-NEXT:    s_mov_b32 s7, -1
+; ALIGNED-NEXT:  .LBB7_5: ; %memmove_bwd_loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    v_add_co_u32 v4, vcc_lo, v2, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT:    v_add_co_u32 v96, vcc_lo, v0, s4
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v97, vcc_lo, s5, v1, vcc_lo
+; ALIGNED-NEXT:    s_clause 0xf
+; ALIGNED-NEXT:    global_load_dwordx4 v[98:101], v[4:5], off offset:240
+; ALIGNED-NEXT:    global_load_dwordx4 v[84:87], v[4:5], off offset:224
+; ALIGNED-NEXT:    global_load_dwordx4 v[80:83], v[4:5], off offset:208
+; ALIGNED-NEXT:    global_load_dwordx4 v[68:71], v[4:5], off offset:192
+; ALIGNED-NEXT:    global_load_dwordx4 v[64:67], v[4:5], off offset:176
+; ALIGNED-NEXT:    global_load_dwordx4 v[52:55], v[4:5], off offset:160
+; ALIGNED-NEXT:    global_load_dwordx4 v[48:51], v[4:5], off offset:144
+; ALIGNED-NEXT:    global_load_dwordx4 v[36:39], v[4:5], off offset:128
+; ALIGNED-NEXT:    global_load_dwordx4 v[32:35], v[4:5], off offset:112
+; ALIGNED-NEXT:    global_load_dwordx4 v[28:31], v[4:5], off offset:96
+; ALIGNED-NEXT:    global_load_dwordx4 v[24:27], v[4:5], off offset:80
+; ALIGNED-NEXT:    global_load_dwordx4 v[20:23], v[4:5], off offset:64
+; ALIGNED-NEXT:    global_load_dwordx4 v[16:19], v[4:5], off offset:48
+; ALIGNED-NEXT:    global_load_dwordx4 v[12:15], v[4:5], off offset:32
+; ALIGNED-NEXT:    global_load_dwordx4 v[8:11], v[4:5], off offset:16
+; ALIGNED-NEXT:    global_load_dwordx4 v[4:7], v[4:5], off
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0xffffff00
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, -1
+; ALIGNED-NEXT:    s_waitcnt vmcnt(15)
+; ALIGNED-NEXT:    buffer_store_dword v100, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT:    buffer_store_dword v99, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT:    buffer_store_dword v98, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v100 offset:250
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v101 offset:254
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:252
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:248
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v99 offset:246
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:244
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v98 offset:242
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:240
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v100
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v100
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 8, v101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v98
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v98
+; ALIGNED-NEXT:    s_waitcnt vmcnt(14)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v86
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v86
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:251
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v87
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:249
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v87
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:255
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v85
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:253
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 8, v85
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:247
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v84
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:245
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v84
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:243
+; ALIGNED-NEXT:    s_waitcnt vmcnt(13)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v82
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:241
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v82
+; ALIGNED-NEXT:    buffer_store_dword v86, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT:    buffer_store_dword v87, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT:    buffer_store_dword v85, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT:    buffer_store_dword v84, off, s[0:3], s32 offset:432
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v86 offset:234
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v87 offset:238
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:236
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:232
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v85 offset:230
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:228
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v84 offset:226
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:224
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v83
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v81
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v81
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:235
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v80
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:233
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v80
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:239
+; ALIGNED-NEXT:    s_waitcnt vmcnt(12)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v70
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:237
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v70
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:231
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v71
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:229
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 8, v71
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:227
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v69
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:225
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v69
+; ALIGNED-NEXT:    buffer_store_dword v82, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT:    buffer_store_dword v83, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT:    buffer_store_dword v80, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v82 offset:218
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v83 offset:222
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:220
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:216
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v81 offset:214
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:212
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v80 offset:210
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:208
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v68
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(11)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v66
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v66
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:219
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v67
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:217
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v67
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:223
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v65
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:221
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v65
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:215
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v64
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:213
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v64
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:211
+; ALIGNED-NEXT:    s_waitcnt vmcnt(10)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v54
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:209
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v54
+; ALIGNED-NEXT:    buffer_store_dword v70, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT:    buffer_store_dword v71, off, s[0:3], s32 offset:412
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v70 offset:202
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v71 offset:206
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:204
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:200
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v69 offset:198
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:196
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v68 offset:194
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:192
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 8, v55
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 24, v52
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 8, v52
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 24, v55
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:203
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 24, v53
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:201
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v53
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:207
+; ALIGNED-NEXT:    s_waitcnt vmcnt(9)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v50
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:205
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 8, v50
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:199
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v51
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:197
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v51
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:195
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v49
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:193
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v49
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT:    buffer_store_dword v65, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT:    buffer_store_dword v64, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v66 offset:186
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v67 offset:190
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v67 offset:188
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v66 offset:184
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v65 offset:182
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v65 offset:180
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v64 offset:178
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v64 offset:176
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 24, v48
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v48
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:187
+; ALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v39
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 24, v38
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v38
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:185
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v39
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:191
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v37
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:189
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v37
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:183
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v36
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:181
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v36
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:179
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v34
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:177
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v34
+; ALIGNED-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:496
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v54 offset:170
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v54 offset:168
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v55 offset:174
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v55 offset:172
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v52 offset:162
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v52 offset:160
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v53 offset:166
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v53 offset:164
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 24, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 8, v35
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 24, v33
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v116, 8, v33
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:171
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v32
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:169
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v32
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:173
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 24, v31
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:163
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:161
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v29
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:175
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 24, v30
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 8, v30
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:167
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:165
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v29
+; ALIGNED-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v50 offset:154
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v51 offset:158
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v51 offset:156
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v50 offset:152
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v49 offset:150
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v49 offset:148
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v48 offset:146
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v48 offset:144
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 24, v28
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v50, 24, v26
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:155
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:153
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:159
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:157
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:151
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:149
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v64 offset:147
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v65 offset:145
+; ALIGNED-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v38 offset:138
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v39 offset:142
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v39 offset:140
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v38 offset:136
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v37 offset:134
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v37 offset:132
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v36 offset:130
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v36 offset:128
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:143
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v82, 24, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v51, 8, v26
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v66 offset:139
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v67 offset:137
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:141
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:135
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:133
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:131
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:129
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v34 offset:122
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v35 offset:126
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v35 offset:124
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v34 offset:120
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v33 offset:118
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v33 offset:116
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v32 offset:114
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v32 offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v34, 24, v14
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:123
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:121
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v53 offset:127
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v54 offset:125
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v55 offset:119
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v116 offset:117
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:115
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v114, 24, v10
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:113
+; ALIGNED-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v30 offset:106
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v31 offset:110
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v31 offset:108
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v30 offset:104
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v29 offset:102
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v29 offset:100
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v28 offset:98
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v28 offset:96
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:111
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v69, 24, v6
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v103, 24, v27
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v83, 8, v18
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v35, 8, v14
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v115, 8, v10
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:109
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v70, 8, v6
+; ALIGNED-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v102, 8, v27
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v101, 24, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v99, 24, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v86, 24, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v28, 24, v11
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:103
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v71, 24, v7
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v100, 8, v25
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v80, 24, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v81, 8, v24
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v64, 24, v22
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v65, 8, v22
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v36, 24, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v37, 8, v23
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v38, 24, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v39, 8, v21
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v66, 24, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v67, 8, v20
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v98, 8, v19
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v84, 24, v17
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v85, 8, v17
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v32, 24, v16
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v33, 8, v16
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v87, 8, v15
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v53, 24, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v54, 8, v13
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v55, 24, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v113, 8, v12
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v29, 8, v11
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v30, 24, v9
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v31, 8, v9
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v52 offset:107
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v52, 24, v8
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:105
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v68, 8, v8
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:101
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v112, 8, v7
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v48 offset:99
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v48, 24, v5
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v49 offset:97
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v49, 8, v5
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v26 offset:90
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v27 offset:94
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v27 offset:92
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v26 offset:88
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v25 offset:86
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v25 offset:84
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v24 offset:82
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v24 offset:80
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v24, 24, v4
+; ALIGNED-NEXT:    v_lshrrev_b32_e32 v25, 8, v4
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v50 offset:91
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v51 offset:89
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v103 offset:95
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v102 offset:93
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v101 offset:87
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v100 offset:85
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v80 offset:83
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v81 offset:81
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v22 offset:74
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v23 offset:78
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v23 offset:76
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v22 offset:72
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v21 offset:70
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v21 offset:68
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v20 offset:66
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v20 offset:64
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v64 offset:75
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v65 offset:73
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v36 offset:79
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v37 offset:77
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v38 offset:71
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v39 offset:69
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v66 offset:67
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v67 offset:65
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:352
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v82 offset:59
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v18 offset:58
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v83 offset:57
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v19 offset:62
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v99 offset:63
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v19 offset:60
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v98 offset:61
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v18 offset:56
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v84 offset:55
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v17 offset:54
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v85 offset:53
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v17 offset:52
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v32 offset:51
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v16 offset:50
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v33 offset:49
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v16 offset:48
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:376
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:380
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:372
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v14 offset:42
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v34 offset:43
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v35 offset:41
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v15 offset:46
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v86 offset:47
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v15 offset:44
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v87 offset:45
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v14 offset:40
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v53 offset:39
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v13 offset:38
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v54 offset:37
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v13 offset:36
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v55 offset:35
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v12 offset:34
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v113 offset:33
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v12 offset:32
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:320
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v10 offset:26
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v114 offset:27
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v115 offset:25
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v11 offset:30
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v28 offset:31
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v11 offset:28
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v29 offset:29
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v10 offset:24
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v30 offset:23
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v9 offset:22
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v31 offset:21
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v9 offset:20
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v52 offset:19
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v8 offset:18
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v68 offset:17
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v8 offset:16
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v6 offset:10
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v69 offset:11
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v70 offset:9
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v7 offset:14
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v71 offset:15
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v7 offset:12
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v112 offset:13
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v6 offset:8
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v48 offset:7
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v5 offset:6
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v49 offset:5
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v5 offset:4
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v24 offset:3
+; ALIGNED-NEXT:    flat_store_byte_d16_hi v[96:97], v4 offset:2
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v25 offset:1
+; ALIGNED-NEXT:    flat_store_byte v[96:97], v4
+; ALIGNED-NEXT:    s_cbranch_scc0 .LBB7_5
+; ALIGNED-NEXT:  .LBB7_6: ; %Flow7
+; ALIGNED-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
+; ALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memmove_p0_p4_sz2048:
+; UNROLL3:       ; %bb.0: ; %entry
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    s_mov_b32 s4, exec_lo
+; UNROLL3-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; UNROLL3-NEXT:    s_xor_b32 s6, exec_lo, s4
+; UNROLL3-NEXT:    s_cbranch_execz .LBB7_4
+; UNROLL3-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB7_2: ; %memmove_fwd_loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    v_add_co_u32 v12, vcc_lo, v2, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    s_clause 0x2
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
+; UNROLL3-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off
+; UNROLL3-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:32
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 48
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[4:7] offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[12:15] offset:32
+; UNROLL3-NEXT:    s_cmp_lg_u64 s[4:5], 0x7e0
+; UNROLL3-NEXT:    s_cbranch_scc1 .LBB7_2
+; UNROLL3-NEXT:  ; %bb.3: ; %memmove_fwd_residual
+; UNROLL3-NEXT:    s_clause 0x1
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:2016
+; UNROLL3-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:2032
+; UNROLL3-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:2016
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[8:11] offset:2032
+; UNROLL3-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; UNROLL3-NEXT:  .LBB7_4: ; %Flow4
+; UNROLL3-NEXT:    s_andn2_saveexec_b32 s8, s6
+; UNROLL3-NEXT:    s_cbranch_execz .LBB7_7
+; UNROLL3-NEXT:  ; %bb.5: ; %memmove_bwd_residual
+; UNROLL3-NEXT:    s_clause 0x1
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:2032
+; UNROLL3-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:2016
+; UNROLL3-NEXT:    s_movk_i32 s6, 0xffd0
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0x7b0
+; UNROLL3-NEXT:    s_mov_b32 s7, -1
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:2032
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[8:11] offset:2016
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB7_6: ; %memmove_bwd_loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    v_add_co_u32 v12, vcc_lo, v2, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, s5, v3, vcc_lo
+; UNROLL3-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    s_clause 0x2
+; UNROLL3-NEXT:    global_load_dwordx4 v[4:7], v[12:13], off offset:16
+; UNROLL3-NEXT:    global_load_dwordx4 v[8:11], v[12:13], off
+; UNROLL3-NEXT:    global_load_dwordx4 v[12:15], v[12:13], off offset:32
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 0xffffffd0
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, -1
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[4:7] offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[12:15] offset:32
+; UNROLL3-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; UNROLL3-NEXT:    s_cbranch_scc0 .LBB7_6
+; UNROLL3-NEXT:  .LBB7_7: ; %Flow5
+; UNROLL3-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; UNROLL3-NEXT:    s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 2048, i1 false)
+  ret void
+}
+
+define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz2048:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s4, exec_lo
+; CHECK-NEXT:    v_cmpx_ge_u32_e64 v1, v0
+; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s4
+; CHECK-NEXT:    s_cbranch_execz .LBB8_3
+; CHECK-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0x800
+; CHECK-NEXT:  .LBB8_2: ; %memmove_fwd_loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_clause 0x3e
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:252
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:248
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:244
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:236
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:232
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:228
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:224
+; CHECK-NEXT:    buffer_load_dword v10, v1, s[0:3], 0 offen offset:220
+; CHECK-NEXT:    buffer_load_dword v11, v1, s[0:3], 0 offen offset:216
+; CHECK-NEXT:    buffer_load_dword v12, v1, s[0:3], 0 offen offset:212
+; CHECK-NEXT:    buffer_load_dword v13, v1, s[0:3], 0 offen offset:208
+; CHECK-NEXT:    buffer_load_dword v14, v1, s[0:3], 0 offen offset:204
+; CHECK-NEXT:    buffer_load_dword v15, v1, s[0:3], 0 offen offset:200
+; CHECK-NEXT:    buffer_load_dword v16, v1, s[0:3], 0 offen offset:196
+; CHECK-NEXT:    buffer_load_dword v17, v1, s[0:3], 0 offen offset:192
+; CHECK-NEXT:    buffer_load_dword v18, v1, s[0:3], 0 offen offset:188
+; CHECK-NEXT:    buffer_load_dword v19, v1, s[0:3], 0 offen offset:184
+; CHECK-NEXT:    buffer_load_dword v20, v1, s[0:3], 0 offen offset:180
+; CHECK-NEXT:    buffer_load_dword v21, v1, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    buffer_load_dword v22, v1, s[0:3], 0 offen offset:172
+; CHECK-NEXT:    buffer_load_dword v23, v1, s[0:3], 0 offen offset:168
+; CHECK-NEXT:    buffer_load_dword v24, v1, s[0:3], 0 offen offset:164
+; CHECK-NEXT:    buffer_load_dword v25, v1, s[0:3], 0 offen offset:160
+; CHECK-NEXT:    buffer_load_dword v26, v1, s[0:3], 0 offen offset:156
+; CHECK-NEXT:    buffer_load_dword v27, v1, s[0:3], 0 offen offset:152
+; CHECK-NEXT:    buffer_load_dword v28, v1, s[0:3], 0 offen offset:148
+; CHECK-NEXT:    buffer_load_dword v29, v1, s[0:3], 0 offen offset:144
+; CHECK-NEXT:    buffer_load_dword v30, v1, s[0:3], 0 offen offset:140
+; CHECK-NEXT:    buffer_load_dword v31, v1, s[0:3], 0 offen offset:136
+; CHECK-NEXT:    buffer_load_dword v32, v1, s[0:3], 0 offen offset:132
+; CHECK-NEXT:    buffer_load_dword v33, v1, s[0:3], 0 offen offset:128
+; CHECK-NEXT:    buffer_load_dword v34, v1, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v35, v1, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v36, v1, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v37, v1, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v38, v1, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v39, v1, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v48, v1, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v49, v1, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    buffer_load_dword v50, v1, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v51, v1, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v52, v1, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v53, v1, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    buffer_load_dword v54, v1, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    buffer_load_dword v55, v1, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v64, v1, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v65, v1, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v66, v1, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v67, v1, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v68, v1, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v69, v1, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v70, v1, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v71, v1, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v80, v1, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v81, v1, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v82, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v83, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v84, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v85, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v86, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v87, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v96, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v97, v1, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_nc_u32_e32 v1, 0x100, v1
+; CHECK-NEXT:    s_add_u32 s4, s4, 0xffffff00
+; CHECK-NEXT:    s_addc_u32 s5, s5, -1
+; CHECK-NEXT:    s_waitcnt vmcnt(62)
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:248
+; CHECK-NEXT:    s_waitcnt vmcnt(61)
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:244
+; CHECK-NEXT:    s_waitcnt vmcnt(60)
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(59)
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:236
+; CHECK-NEXT:    s_waitcnt vmcnt(58)
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:232
+; CHECK-NEXT:    s_waitcnt vmcnt(57)
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:228
+; CHECK-NEXT:    s_waitcnt vmcnt(56)
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(55)
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:220
+; CHECK-NEXT:    s_waitcnt vmcnt(54)
+; CHECK-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
+; CHECK-NEXT:    s_waitcnt vmcnt(53)
+; CHECK-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:212
+; CHECK-NEXT:    s_waitcnt vmcnt(52)
+; CHECK-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(51)
+; CHECK-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:204
+; CHECK-NEXT:    s_waitcnt vmcnt(50)
+; CHECK-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:200
+; CHECK-NEXT:    s_waitcnt vmcnt(49)
+; CHECK-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:196
+; CHECK-NEXT:    s_waitcnt vmcnt(48)
+; CHECK-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(47)
+; CHECK-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:188
+; CHECK-NEXT:    s_waitcnt vmcnt(46)
+; CHECK-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:184
+; CHECK-NEXT:    s_waitcnt vmcnt(45)
+; CHECK-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:180
+; CHECK-NEXT:    s_waitcnt vmcnt(44)
+; CHECK-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(43)
+; CHECK-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:172
+; CHECK-NEXT:    s_waitcnt vmcnt(42)
+; CHECK-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:168
+; CHECK-NEXT:    s_waitcnt vmcnt(41)
+; CHECK-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:164
+; CHECK-NEXT:    s_waitcnt vmcnt(40)
+; CHECK-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(39)
+; CHECK-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:156
+; CHECK-NEXT:    s_waitcnt vmcnt(38)
+; CHECK-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:152
+; CHECK-NEXT:    s_waitcnt vmcnt(37)
+; CHECK-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:148
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:140
+; CHECK-NEXT:    s_waitcnt vmcnt(34)
+; CHECK-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:136
+; CHECK-NEXT:    s_waitcnt vmcnt(33)
+; CHECK-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:132
+; CHECK-NEXT:    s_waitcnt vmcnt(32)
+; CHECK-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(31)
+; CHECK-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    s_waitcnt vmcnt(30)
+; CHECK-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    s_waitcnt vmcnt(29)
+; CHECK-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    s_waitcnt vmcnt(28)
+; CHECK-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(27)
+; CHECK-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    s_waitcnt vmcnt(24)
+; CHECK-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(23)
+; CHECK-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    s_waitcnt vmcnt(22)
+; CHECK-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    s_waitcnt vmcnt(18)
+; CHECK-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    s_waitcnt vmcnt(17)
+; CHECK-NEXT:    buffer_store_dword v64, v0, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    buffer_store_dword v65, v0, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(15)
+; CHECK-NEXT:    buffer_store_dword v66, v0, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    buffer_store_dword v67, v0, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    buffer_store_dword v68, v0, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    buffer_store_dword v69, v0, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    buffer_store_dword v70, v0, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    buffer_store_dword v71, v0, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    buffer_store_dword v80, v0, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    buffer_store_dword v81, v0, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    buffer_store_dword v82, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    buffer_store_dword v83, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    buffer_store_dword v84, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    buffer_store_dword v85, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    buffer_store_dword v86, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    buffer_store_dword v87, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v96, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v97, v0, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
+; CHECK-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB8_2
+; CHECK-NEXT:  .LBB8_3: ; %Flow18
+; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s6
+; CHECK-NEXT:    s_cbranch_execz .LBB8_6
+; CHECK-NEXT:  ; %bb.4: ; %memmove_bwd_loop.preheader
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, 0x700, v0
+; CHECK-NEXT:    v_add_nc_u32_e32 v1, 0x700, v1
+; CHECK-NEXT:    s_movk_i32 s4, 0xf800
+; CHECK-NEXT:    s_mov_b32 s5, -1
+; CHECK-NEXT:  .LBB8_5: ; %memmove_bwd_loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_clause 0x3e
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:252
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:248
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:244
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:236
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:232
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:228
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:224
+; CHECK-NEXT:    buffer_load_dword v10, v1, s[0:3], 0 offen offset:220
+; CHECK-NEXT:    buffer_load_dword v11, v1, s[0:3], 0 offen offset:216
+; CHECK-NEXT:    buffer_load_dword v12, v1, s[0:3], 0 offen offset:212
+; CHECK-NEXT:    buffer_load_dword v13, v1, s[0:3], 0 offen offset:208
+; CHECK-NEXT:    buffer_load_dword v14, v1, s[0:3], 0 offen offset:204
+; CHECK-NEXT:    buffer_load_dword v15, v1, s[0:3], 0 offen offset:200
+; CHECK-NEXT:    buffer_load_dword v16, v1, s[0:3], 0 offen offset:196
+; CHECK-NEXT:    buffer_load_dword v17, v1, s[0:3], 0 offen offset:192
+; CHECK-NEXT:    buffer_load_dword v18, v1, s[0:3], 0 offen offset:188
+; CHECK-NEXT:    buffer_load_dword v19, v1, s[0:3], 0 offen offset:184
+; CHECK-NEXT:    buffer_load_dword v20, v1, s[0:3], 0 offen offset:180
+; CHECK-NEXT:    buffer_load_dword v21, v1, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    buffer_load_dword v22, v1, s[0:3], 0 offen offset:172
+; CHECK-NEXT:    buffer_load_dword v23, v1, s[0:3], 0 offen offset:168
+; CHECK-NEXT:    buffer_load_dword v24, v1, s[0:3], 0 offen offset:164
+; CHECK-NEXT:    buffer_load_dword v25, v1, s[0:3], 0 offen offset:160
+; CHECK-NEXT:    buffer_load_dword v26, v1, s[0:3], 0 offen offset:156
+; CHECK-NEXT:    buffer_load_dword v27, v1, s[0:3], 0 offen offset:152
+; CHECK-NEXT:    buffer_load_dword v28, v1, s[0:3], 0 offen offset:148
+; CHECK-NEXT:    buffer_load_dword v29, v1, s[0:3], 0 offen offset:144
+; CHECK-NEXT:    buffer_load_dword v30, v1, s[0:3], 0 offen offset:140
+; CHECK-NEXT:    buffer_load_dword v31, v1, s[0:3], 0 offen offset:136
+; CHECK-NEXT:    buffer_load_dword v32, v1, s[0:3], 0 offen offset:132
+; CHECK-NEXT:    buffer_load_dword v33, v1, s[0:3], 0 offen offset:128
+; CHECK-NEXT:    buffer_load_dword v34, v1, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v35, v1, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v36, v1, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v37, v1, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v38, v1, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v39, v1, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v48, v1, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v49, v1, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    buffer_load_dword v50, v1, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v51, v1, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v52, v1, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v53, v1, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    buffer_load_dword v54, v1, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    buffer_load_dword v55, v1, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v64, v1, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v65, v1, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v66, v1, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v67, v1, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v68, v1, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v69, v1, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v70, v1, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v71, v1, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v80, v1, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v81, v1, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v82, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v83, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v84, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v85, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v86, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v87, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v96, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v97, v1, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_nc_u32_e32 v1, 0xffffff00, v1
+; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(62)
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:252
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:248
+; CHECK-NEXT:    s_waitcnt vmcnt(61)
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:244
+; CHECK-NEXT:    s_waitcnt vmcnt(60)
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(59)
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:236
+; CHECK-NEXT:    s_waitcnt vmcnt(58)
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:232
+; CHECK-NEXT:    s_waitcnt vmcnt(57)
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:228
+; CHECK-NEXT:    s_waitcnt vmcnt(56)
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(55)
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:220
+; CHECK-NEXT:    s_waitcnt vmcnt(54)
+; CHECK-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
+; CHECK-NEXT:    s_waitcnt vmcnt(53)
+; CHECK-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:212
+; CHECK-NEXT:    s_waitcnt vmcnt(52)
+; CHECK-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(51)
+; CHECK-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:204
+; CHECK-NEXT:    s_waitcnt vmcnt(50)
+; CHECK-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:200
+; CHECK-NEXT:    s_waitcnt vmcnt(49)
+; CHECK-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:196
+; CHECK-NEXT:    s_waitcnt vmcnt(48)
+; CHECK-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(47)
+; CHECK-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:188
+; CHECK-NEXT:    s_waitcnt vmcnt(46)
+; CHECK-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:184
+; CHECK-NEXT:    s_waitcnt vmcnt(45)
+; CHECK-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:180
+; CHECK-NEXT:    s_waitcnt vmcnt(44)
+; CHECK-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(43)
+; CHECK-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:172
+; CHECK-NEXT:    s_waitcnt vmcnt(42)
+; CHECK-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:168
+; CHECK-NEXT:    s_waitcnt vmcnt(41)
+; CHECK-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:164
+; CHECK-NEXT:    s_waitcnt vmcnt(40)
+; CHECK-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(39)
+; CHECK-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:156
+; CHECK-NEXT:    s_waitcnt vmcnt(38)
+; CHECK-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:152
+; CHECK-NEXT:    s_waitcnt vmcnt(37)
+; CHECK-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:148
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:140
+; CHECK-NEXT:    s_waitcnt vmcnt(34)
+; CHECK-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:136
+; CHECK-NEXT:    s_waitcnt vmcnt(33)
+; CHECK-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:132
+; CHECK-NEXT:    s_waitcnt vmcnt(32)
+; CHECK-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(31)
+; CHECK-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    s_waitcnt vmcnt(30)
+; CHECK-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    s_waitcnt vmcnt(29)
+; CHECK-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    s_waitcnt vmcnt(28)
+; CHECK-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(27)
+; CHECK-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    s_waitcnt vmcnt(24)
+; CHECK-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(23)
+; CHECK-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    s_waitcnt vmcnt(22)
+; CHECK-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    s_waitcnt vmcnt(18)
+; CHECK-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    s_waitcnt vmcnt(17)
+; CHECK-NEXT:    buffer_store_dword v64, v0, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    buffer_store_dword v65, v0, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(15)
+; CHECK-NEXT:    buffer_store_dword v66, v0, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    buffer_store_dword v67, v0, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    buffer_store_dword v68, v0, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    buffer_store_dword v69, v0, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    buffer_store_dword v70, v0, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    buffer_store_dword v71, v0, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    buffer_store_dword v80, v0, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    buffer_store_dword v81, v0, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    buffer_store_dword v82, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    buffer_store_dword v83, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    buffer_store_dword v84, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    buffer_store_dword v85, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    buffer_store_dword v86, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    buffer_store_dword v87, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v96, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v97, v0, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, 0xffffff00, v0
+; CHECK-NEXT:    s_cmp_eq_u64 s[4:5], 0
+; CHECK-NEXT:    s_cbranch_scc0 .LBB8_5
+; CHECK-NEXT:  .LBB8_6: ; %Flow19
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memmove_p5_p5_sz2048:
+; ALIGNED:       ; %bb.0: ; %entry
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v72, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v73, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v74, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v75, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v76, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v77, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v78, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v79, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v88, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v89, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v90, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v91, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v92, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v93, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v94, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v95, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v104, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v105, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v106, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v107, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v108, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v109, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v110, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v111, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v120, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v121, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v122, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v123, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v124, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_mov_b32 s4, exec_lo
+; ALIGNED-NEXT:    v_cmpx_ge_u32_e64 v1, v0
+; ALIGNED-NEXT:    s_xor_b32 s6, exec_lo, s4
+; ALIGNED-NEXT:    s_cbranch_execz .LBB8_3
+; ALIGNED-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0x800
+; ALIGNED-NEXT:  .LBB8_2: ; %memmove_fwd_loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0xffffff00
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, -1
+; ALIGNED-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x3e
+; ALIGNED-NEXT:    buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT:    buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT:    buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT:    buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT:    buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT:    buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT:    buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT:    buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT:    buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT:    buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT:    buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT:    buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT:    buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT:    buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT:    buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT:    buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT:    buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT:    buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT:    buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT:    buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT:    buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT:    buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT:    buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT:    buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT:    buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT:    buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT:    buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT:    buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT:    buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT:    buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT:    buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT:    buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT:    buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT:    buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT:    buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT:    buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT:    buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT:    buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT:    buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT:    buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT:    buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT:    buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT:    buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT:    buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT:    buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT:    buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT:    buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT:    buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT:    buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT:    buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT:    buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT:    buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT:    buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT:    buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT:    buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT:    buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT:    buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT:    buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT:    buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT:    buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT:    buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT:    buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT:    buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT:    s_clause 0x3a
+; ALIGNED-NEXT:    buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT:    buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT:    buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT:    buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT:    buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT:    buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT:    buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT:    buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT:    buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT:    buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT:    buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT:    buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT:    buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT:    buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT:    buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT:    buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT:    buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT:    buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT:    buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT:    buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT:    buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT:    buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT:    buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT:    buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT:    buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT:    buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT:    buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT:    buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:157
+; ALIGNED-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:151
+; ALIGNED-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:148
+; ALIGNED-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:146
+; ALIGNED-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:145
+; ALIGNED-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:142
+; ALIGNED-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:141
+; ALIGNED-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139
+; ALIGNED-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138
+; ALIGNED-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137
+; ALIGNED-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136
+; ALIGNED-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135
+; ALIGNED-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134
+; ALIGNED-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:132
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:130
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125
+; ALIGNED-NEXT:    buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT:    buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123
+; ALIGNED-NEXT:    buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:121
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:120
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:119
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:116
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:114
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:113
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:110
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:109
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:107
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:106
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:105
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:104
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:103
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:100
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:98
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:97
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:94
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:93
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:91
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:90
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:89
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:88
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:81
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:80
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:79
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:78
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:76
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v1, 0x100, v1
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT:    buffer_store_byte v123, v0, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT:    buffer_store_byte v122, v0, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT:    buffer_store_byte v121, v0, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT:    buffer_store_byte v120, v0, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT:    buffer_store_byte v111, v0, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT:    buffer_store_byte v110, v0, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT:    buffer_store_byte v109, v0, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT:    buffer_store_byte v108, v0, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT:    buffer_store_byte v107, v0, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT:    buffer_store_byte v106, v0, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT:    buffer_store_byte v105, v0, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT:    buffer_store_byte v104, v0, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT:    buffer_store_byte v95, v0, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT:    buffer_store_byte v94, v0, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT:    buffer_store_byte v93, v0, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT:    buffer_store_byte v92, v0, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT:    buffer_store_byte v91, v0, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT:    buffer_store_byte v90, v0, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT:    buffer_store_byte v89, v0, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT:    buffer_store_byte v88, v0, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT:    buffer_store_byte v79, v0, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT:    buffer_store_byte v78, v0, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT:    buffer_store_byte v77, v0, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT:    buffer_store_byte v76, v0, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT:    buffer_store_byte v75, v0, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT:    buffer_store_byte v74, v0, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT:    buffer_store_byte v73, v0, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT:    buffer_store_byte v72, v0, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT:    buffer_store_byte v63, v0, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT:    buffer_store_byte v62, v0, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT:    buffer_store_byte v61, v0, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT:    buffer_store_byte v60, v0, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT:    buffer_store_byte v59, v0, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT:    buffer_store_byte v58, v0, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT:    buffer_store_byte v57, v0, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT:    buffer_store_byte v56, v0, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT:    buffer_store_byte v47, v0, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT:    buffer_store_byte v46, v0, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT:    buffer_store_byte v45, v0, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT:    buffer_store_byte v44, v0, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT:    buffer_store_byte v43, v0, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT:    buffer_store_byte v42, v0, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT:    buffer_store_byte v41, v0, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT:    buffer_store_byte v40, v0, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT:    buffer_store_byte v119, v0, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT:    buffer_store_byte v118, v0, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT:    buffer_store_byte v117, v0, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT:    buffer_store_byte v116, v0, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT:    buffer_store_byte v115, v0, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT:    buffer_store_byte v114, v0, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT:    buffer_store_byte v113, v0, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT:    buffer_store_byte v112, v0, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT:    buffer_store_byte v103, v0, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT:    buffer_store_byte v102, v0, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT:    buffer_store_byte v101, v0, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT:    buffer_store_byte v100, v0, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT:    buffer_store_byte v99, v0, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT:    buffer_store_byte v98, v0, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT:    buffer_store_byte v97, v0, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT:    buffer_store_byte v96, v0, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT:    buffer_store_byte v87, v0, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT:    buffer_store_byte v86, v0, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT:    buffer_store_byte v85, v0, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT:    buffer_store_byte v84, v0, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT:    buffer_store_byte v83, v0, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT:    buffer_store_byte v82, v0, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT:    buffer_store_byte v81, v0, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT:    buffer_store_byte v80, v0, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT:    buffer_store_byte v71, v0, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT:    buffer_store_byte v70, v0, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT:    buffer_store_byte v69, v0, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT:    buffer_store_byte v68, v0, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT:    buffer_store_byte v67, v0, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT:    buffer_store_byte v66, v0, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT:    buffer_store_byte v65, v0, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT:    buffer_store_byte v64, v0, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT:    buffer_store_byte v55, v0, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT:    buffer_store_byte v54, v0, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT:    buffer_store_byte v53, v0, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT:    buffer_store_byte v52, v0, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT:    buffer_store_byte v51, v0, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT:    buffer_store_byte v50, v0, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT:    buffer_store_byte v49, v0, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT:    buffer_store_byte v48, v0, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT:    buffer_store_byte v39, v0, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT:    buffer_store_byte v38, v0, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT:    buffer_store_byte v37, v0, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT:    buffer_store_byte v36, v0, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT:    buffer_store_byte v35, v0, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT:    buffer_store_byte v34, v0, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:157
+; ALIGNED-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:151
+; ALIGNED-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:148
+; ALIGNED-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:146
+; ALIGNED-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:145
+; ALIGNED-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:142
+; ALIGNED-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:141
+; ALIGNED-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:139
+; ALIGNED-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:138
+; ALIGNED-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:137
+; ALIGNED-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:136
+; ALIGNED-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:135
+; ALIGNED-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:134
+; ALIGNED-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:132
+; ALIGNED-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:130
+; ALIGNED-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:129
+; ALIGNED-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT:    buffer_store_byte v127, v0, s[0:3], 0 offen offset:125
+; ALIGNED-NEXT:    buffer_store_byte v126, v0, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT:    buffer_store_byte v125, v0, s[0:3], 0 offen offset:123
+; ALIGNED-NEXT:    buffer_store_byte v124, v0, s[0:3], 0 offen offset:122
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:121
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:120
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:119
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:116
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:114
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:113
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:110
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:109
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:107
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:106
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:105
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:104
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:103
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:100
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:98
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:97
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:94
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:93
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:91
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:90
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:89
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:88
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:81
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:80
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:79
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:78
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:76
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v0, 0x100, v0
+; ALIGNED-NEXT:    s_cbranch_scc1 .LBB8_2
+; ALIGNED-NEXT:  .LBB8_3: ; %Flow18
+; ALIGNED-NEXT:    s_andn2_saveexec_b32 s6, s6
+; ALIGNED-NEXT:    s_cbranch_execz .LBB8_6
+; ALIGNED-NEXT:  ; %bb.4: ; %memmove_bwd_loop.preheader
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v0, 0x700, v0
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v1, 0x700, v1
+; ALIGNED-NEXT:    s_movk_i32 s4, 0xf800
+; ALIGNED-NEXT:    s_mov_b32 s5, -1
+; ALIGNED-NEXT:  .LBB8_5: ; %memmove_bwd_loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT:    s_cmp_eq_u64 s[4:5], 0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x3e
+; ALIGNED-NEXT:    buffer_load_ubyte v115, v1, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT:    buffer_load_ubyte v114, v1, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT:    buffer_load_ubyte v113, v1, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT:    buffer_load_ubyte v112, v1, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT:    buffer_load_ubyte v103, v1, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT:    buffer_load_ubyte v102, v1, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT:    buffer_load_ubyte v101, v1, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT:    buffer_load_ubyte v100, v1, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT:    buffer_load_ubyte v99, v1, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT:    buffer_load_ubyte v98, v1, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT:    buffer_load_ubyte v97, v1, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT:    buffer_load_ubyte v96, v1, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT:    buffer_load_ubyte v87, v1, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT:    buffer_load_ubyte v86, v1, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT:    buffer_load_ubyte v85, v1, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT:    buffer_load_ubyte v84, v1, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT:    buffer_load_ubyte v83, v1, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT:    buffer_load_ubyte v82, v1, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT:    buffer_load_ubyte v81, v1, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT:    buffer_load_ubyte v80, v1, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT:    buffer_load_ubyte v71, v1, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT:    buffer_load_ubyte v70, v1, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT:    buffer_load_ubyte v69, v1, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT:    buffer_load_ubyte v68, v1, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT:    buffer_load_ubyte v67, v1, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT:    buffer_load_ubyte v66, v1, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT:    buffer_load_ubyte v65, v1, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT:    buffer_load_ubyte v64, v1, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT:    buffer_load_ubyte v55, v1, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT:    buffer_load_ubyte v54, v1, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT:    buffer_load_ubyte v53, v1, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT:    buffer_load_ubyte v52, v1, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT:    buffer_load_ubyte v51, v1, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT:    buffer_load_ubyte v50, v1, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT:    buffer_load_ubyte v49, v1, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT:    buffer_load_ubyte v48, v1, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT:    buffer_load_ubyte v39, v1, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT:    buffer_load_ubyte v38, v1, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT:    buffer_load_ubyte v37, v1, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT:    buffer_load_ubyte v36, v1, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT:    buffer_load_ubyte v35, v1, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT:    buffer_load_ubyte v34, v1, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT:    buffer_load_ubyte v33, v1, s[0:3], 0 offen offset:157
+; ALIGNED-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:151
+; ALIGNED-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:148
+; ALIGNED-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:146
+; ALIGNED-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:145
+; ALIGNED-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:142
+; ALIGNED-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:141
+; ALIGNED-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:139
+; ALIGNED-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:138
+; ALIGNED-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:137
+; ALIGNED-NEXT:    s_clause 0xa
+; ALIGNED-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:136
+; ALIGNED-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:135
+; ALIGNED-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:134
+; ALIGNED-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:132
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:130
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:129
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x34
+; ALIGNED-NEXT:    buffer_load_ubyte v127, v1, s[0:3], 0 offen offset:125
+; ALIGNED-NEXT:    buffer_load_ubyte v126, v1, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT:    buffer_load_ubyte v125, v1, s[0:3], 0 offen offset:123
+; ALIGNED-NEXT:    buffer_load_ubyte v124, v1, s[0:3], 0 offen offset:122
+; ALIGNED-NEXT:    buffer_load_ubyte v123, v1, s[0:3], 0 offen offset:121
+; ALIGNED-NEXT:    buffer_load_ubyte v122, v1, s[0:3], 0 offen offset:120
+; ALIGNED-NEXT:    buffer_load_ubyte v121, v1, s[0:3], 0 offen offset:119
+; ALIGNED-NEXT:    buffer_load_ubyte v120, v1, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT:    buffer_load_ubyte v111, v1, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT:    buffer_load_ubyte v110, v1, s[0:3], 0 offen offset:116
+; ALIGNED-NEXT:    buffer_load_ubyte v109, v1, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT:    buffer_load_ubyte v108, v1, s[0:3], 0 offen offset:114
+; ALIGNED-NEXT:    buffer_load_ubyte v107, v1, s[0:3], 0 offen offset:113
+; ALIGNED-NEXT:    buffer_load_ubyte v106, v1, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT:    buffer_load_ubyte v105, v1, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT:    buffer_load_ubyte v104, v1, s[0:3], 0 offen offset:110
+; ALIGNED-NEXT:    buffer_load_ubyte v95, v1, s[0:3], 0 offen offset:109
+; ALIGNED-NEXT:    buffer_load_ubyte v94, v1, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT:    buffer_load_ubyte v93, v1, s[0:3], 0 offen offset:107
+; ALIGNED-NEXT:    buffer_load_ubyte v92, v1, s[0:3], 0 offen offset:106
+; ALIGNED-NEXT:    buffer_load_ubyte v91, v1, s[0:3], 0 offen offset:105
+; ALIGNED-NEXT:    buffer_load_ubyte v90, v1, s[0:3], 0 offen offset:104
+; ALIGNED-NEXT:    buffer_load_ubyte v89, v1, s[0:3], 0 offen offset:103
+; ALIGNED-NEXT:    buffer_load_ubyte v88, v1, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT:    buffer_load_ubyte v79, v1, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT:    buffer_load_ubyte v78, v1, s[0:3], 0 offen offset:100
+; ALIGNED-NEXT:    buffer_load_ubyte v77, v1, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT:    buffer_load_ubyte v76, v1, s[0:3], 0 offen offset:98
+; ALIGNED-NEXT:    buffer_load_ubyte v75, v1, s[0:3], 0 offen offset:97
+; ALIGNED-NEXT:    buffer_load_ubyte v74, v1, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT:    buffer_load_ubyte v73, v1, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT:    buffer_load_ubyte v72, v1, s[0:3], 0 offen offset:94
+; ALIGNED-NEXT:    buffer_load_ubyte v63, v1, s[0:3], 0 offen offset:93
+; ALIGNED-NEXT:    buffer_load_ubyte v62, v1, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT:    buffer_load_ubyte v61, v1, s[0:3], 0 offen offset:91
+; ALIGNED-NEXT:    buffer_load_ubyte v60, v1, s[0:3], 0 offen offset:90
+; ALIGNED-NEXT:    buffer_load_ubyte v59, v1, s[0:3], 0 offen offset:89
+; ALIGNED-NEXT:    buffer_load_ubyte v58, v1, s[0:3], 0 offen offset:88
+; ALIGNED-NEXT:    buffer_load_ubyte v57, v1, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT:    buffer_load_ubyte v56, v1, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT:    buffer_load_ubyte v47, v1, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT:    buffer_load_ubyte v46, v1, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT:    buffer_load_ubyte v45, v1, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT:    buffer_load_ubyte v44, v1, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT:    buffer_load_ubyte v43, v1, s[0:3], 0 offen offset:81
+; ALIGNED-NEXT:    buffer_load_ubyte v42, v1, s[0:3], 0 offen offset:80
+; ALIGNED-NEXT:    buffer_load_ubyte v41, v1, s[0:3], 0 offen offset:79
+; ALIGNED-NEXT:    buffer_load_ubyte v40, v1, s[0:3], 0 offen offset:78
+; ALIGNED-NEXT:    buffer_load_ubyte v119, v1, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT:    buffer_load_ubyte v118, v1, s[0:3], 0 offen offset:76
+; ALIGNED-NEXT:    buffer_load_ubyte v117, v1, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT:    buffer_load_ubyte v116, v1, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v1, 0xffffff00, v1
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT:    buffer_store_byte v115, v0, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT:    buffer_store_byte v114, v0, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT:    buffer_store_byte v113, v0, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT:    buffer_store_byte v112, v0, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT:    buffer_store_byte v103, v0, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT:    buffer_store_byte v102, v0, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT:    buffer_store_byte v101, v0, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT:    buffer_store_byte v100, v0, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT:    buffer_store_byte v99, v0, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT:    buffer_store_byte v98, v0, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT:    buffer_store_byte v97, v0, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT:    buffer_store_byte v96, v0, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT:    buffer_store_byte v87, v0, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT:    buffer_store_byte v86, v0, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT:    buffer_store_byte v85, v0, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT:    buffer_store_byte v84, v0, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT:    buffer_store_byte v83, v0, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT:    buffer_store_byte v82, v0, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT:    buffer_store_byte v81, v0, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT:    buffer_store_byte v80, v0, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT:    buffer_store_byte v71, v0, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT:    buffer_store_byte v70, v0, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT:    buffer_store_byte v69, v0, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT:    buffer_store_byte v68, v0, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT:    buffer_store_byte v67, v0, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT:    buffer_store_byte v66, v0, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT:    buffer_store_byte v65, v0, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT:    buffer_store_byte v64, v0, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT:    buffer_store_byte v55, v0, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT:    buffer_store_byte v54, v0, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT:    buffer_store_byte v53, v0, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT:    buffer_store_byte v52, v0, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT:    buffer_store_byte v51, v0, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT:    buffer_store_byte v50, v0, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT:    buffer_store_byte v49, v0, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT:    buffer_store_byte v48, v0, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT:    buffer_store_byte v39, v0, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT:    buffer_store_byte v38, v0, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT:    buffer_store_byte v37, v0, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT:    buffer_store_byte v36, v0, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT:    buffer_store_byte v35, v0, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT:    buffer_store_byte v34, v0, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:157
+; ALIGNED-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:151
+; ALIGNED-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:148
+; ALIGNED-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:146
+; ALIGNED-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:145
+; ALIGNED-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:142
+; ALIGNED-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:141
+; ALIGNED-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:139
+; ALIGNED-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:138
+; ALIGNED-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:137
+; ALIGNED-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:136
+; ALIGNED-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:135
+; ALIGNED-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:134
+; ALIGNED-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:132
+; ALIGNED-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:130
+; ALIGNED-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:129
+; ALIGNED-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT:    buffer_store_byte v127, v0, s[0:3], 0 offen offset:125
+; ALIGNED-NEXT:    buffer_store_byte v126, v0, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT:    buffer_store_byte v125, v0, s[0:3], 0 offen offset:123
+; ALIGNED-NEXT:    buffer_store_byte v124, v0, s[0:3], 0 offen offset:122
+; ALIGNED-NEXT:    buffer_store_byte v123, v0, s[0:3], 0 offen offset:121
+; ALIGNED-NEXT:    buffer_store_byte v122, v0, s[0:3], 0 offen offset:120
+; ALIGNED-NEXT:    buffer_store_byte v121, v0, s[0:3], 0 offen offset:119
+; ALIGNED-NEXT:    buffer_store_byte v120, v0, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT:    buffer_store_byte v111, v0, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT:    buffer_store_byte v110, v0, s[0:3], 0 offen offset:116
+; ALIGNED-NEXT:    buffer_store_byte v109, v0, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT:    buffer_store_byte v108, v0, s[0:3], 0 offen offset:114
+; ALIGNED-NEXT:    buffer_store_byte v107, v0, s[0:3], 0 offen offset:113
+; ALIGNED-NEXT:    buffer_store_byte v106, v0, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT:    buffer_store_byte v105, v0, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT:    buffer_store_byte v104, v0, s[0:3], 0 offen offset:110
+; ALIGNED-NEXT:    buffer_store_byte v95, v0, s[0:3], 0 offen offset:109
+; ALIGNED-NEXT:    buffer_store_byte v94, v0, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT:    buffer_store_byte v93, v0, s[0:3], 0 offen offset:107
+; ALIGNED-NEXT:    buffer_store_byte v92, v0, s[0:3], 0 offen offset:106
+; ALIGNED-NEXT:    buffer_store_byte v91, v0, s[0:3], 0 offen offset:105
+; ALIGNED-NEXT:    buffer_store_byte v90, v0, s[0:3], 0 offen offset:104
+; ALIGNED-NEXT:    buffer_store_byte v89, v0, s[0:3], 0 offen offset:103
+; ALIGNED-NEXT:    buffer_store_byte v88, v0, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT:    buffer_store_byte v79, v0, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT:    buffer_store_byte v78, v0, s[0:3], 0 offen offset:100
+; ALIGNED-NEXT:    buffer_store_byte v77, v0, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT:    buffer_store_byte v76, v0, s[0:3], 0 offen offset:98
+; ALIGNED-NEXT:    buffer_store_byte v75, v0, s[0:3], 0 offen offset:97
+; ALIGNED-NEXT:    buffer_store_byte v74, v0, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT:    buffer_store_byte v73, v0, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT:    buffer_store_byte v72, v0, s[0:3], 0 offen offset:94
+; ALIGNED-NEXT:    buffer_store_byte v63, v0, s[0:3], 0 offen offset:93
+; ALIGNED-NEXT:    buffer_store_byte v62, v0, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT:    buffer_store_byte v61, v0, s[0:3], 0 offen offset:91
+; ALIGNED-NEXT:    buffer_store_byte v60, v0, s[0:3], 0 offen offset:90
+; ALIGNED-NEXT:    buffer_store_byte v59, v0, s[0:3], 0 offen offset:89
+; ALIGNED-NEXT:    buffer_store_byte v58, v0, s[0:3], 0 offen offset:88
+; ALIGNED-NEXT:    buffer_store_byte v57, v0, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT:    buffer_store_byte v56, v0, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT:    buffer_store_byte v47, v0, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT:    buffer_store_byte v46, v0, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT:    buffer_store_byte v45, v0, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT:    buffer_store_byte v44, v0, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT:    buffer_store_byte v43, v0, s[0:3], 0 offen offset:81
+; ALIGNED-NEXT:    buffer_store_byte v42, v0, s[0:3], 0 offen offset:80
+; ALIGNED-NEXT:    buffer_store_byte v41, v0, s[0:3], 0 offen offset:79
+; ALIGNED-NEXT:    buffer_store_byte v40, v0, s[0:3], 0 offen offset:78
+; ALIGNED-NEXT:    buffer_store_byte v119, v0, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT:    buffer_store_byte v118, v0, s[0:3], 0 offen offset:76
+; ALIGNED-NEXT:    buffer_store_byte v117, v0, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT:    buffer_store_byte v116, v0, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v0, 0xffffff00, v0
+; ALIGNED-NEXT:    s_cbranch_scc0 .LBB8_5
+; ALIGNED-NEXT:  .LBB8_6: ; %Flow19
+; ALIGNED-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; ALIGNED-NEXT:    s_clause 0x2f
+; ALIGNED-NEXT:    buffer_load_dword v127, off, s[0:3], s32
+; ALIGNED-NEXT:    buffer_load_dword v126, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT:    buffer_load_dword v125, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT:    buffer_load_dword v124, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT:    buffer_load_dword v123, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT:    buffer_load_dword v122, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT:    buffer_load_dword v121, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT:    buffer_load_dword v120, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT:    buffer_load_dword v111, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    buffer_load_dword v110, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_load_dword v109, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_load_dword v108, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    buffer_load_dword v107, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    buffer_load_dword v106, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_load_dword v105, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_load_dword v104, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    buffer_load_dword v95, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    buffer_load_dword v94, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_load_dword v93, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_load_dword v92, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    buffer_load_dword v91, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    buffer_load_dword v90, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_load_dword v89, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_load_dword v88, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    buffer_load_dword v79, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    buffer_load_dword v78, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_load_dword v77, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_load_dword v76, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    buffer_load_dword v75, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    buffer_load_dword v74, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_load_dword v73, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_load_dword v72, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memmove_p5_p5_sz2048:
+; UNROLL3:       ; %bb.0: ; %entry
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    s_mov_b32 s4, exec_lo
+; UNROLL3-NEXT:    v_cmpx_ge_u32_e64 v1, v0
+; UNROLL3-NEXT:    s_xor_b32 s6, exec_lo, s4
+; UNROLL3-NEXT:    s_cbranch_execz .LBB8_4
+; UNROLL3-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; UNROLL3-NEXT:    v_mov_b32_e32 v2, v1
+; UNROLL3-NEXT:    v_mov_b32_e32 v3, v0
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0x7e0
+; UNROLL3-NEXT:  .LBB8_2: ; %memmove_fwd_loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    s_clause 0xb
+; UNROLL3-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen
+; UNROLL3-NEXT:    v_add_nc_u32_e32 v2, 48, v2
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 0xffffffd0
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, -1
+; UNROLL3-NEXT:    s_waitcnt vmcnt(11)
+; UNROLL3-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT:    s_waitcnt vmcnt(10)
+; UNROLL3-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT:    s_waitcnt vmcnt(9)
+; UNROLL3-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT:    s_waitcnt vmcnt(8)
+; UNROLL3-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT:    s_waitcnt vmcnt(7)
+; UNROLL3-NEXT:    buffer_store_dword v8, v3, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    s_waitcnt vmcnt(6)
+; UNROLL3-NEXT:    buffer_store_dword v9, v3, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT:    s_waitcnt vmcnt(5)
+; UNROLL3-NEXT:    buffer_store_dword v10, v3, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
+; UNROLL3-NEXT:    buffer_store_dword v11, v3, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(3)
+; UNROLL3-NEXT:    buffer_store_dword v12, v3, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    buffer_store_dword v13, v3, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    buffer_store_dword v14, v3, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    buffer_store_dword v15, v3, s[0:3], 0 offen
+; UNROLL3-NEXT:    v_add_nc_u32_e32 v3, 48, v3
+; UNROLL3-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; UNROLL3-NEXT:    s_cbranch_scc1 .LBB8_2
+; UNROLL3-NEXT:  ; %bb.3: ; %memmove_fwd_residual
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT:    s_waitcnt vmcnt(3)
+; UNROLL3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:2044
+; UNROLL3-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:2040
+; UNROLL3-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:2036
+; UNROLL3-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:2032
+; UNROLL3-NEXT:    s_waitcnt vmcnt(3)
+; UNROLL3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:2044
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:2040
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:2036
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:2032
+; UNROLL3-NEXT:    ; implicit-def: $vgpr1
+; UNROLL3-NEXT:    ; implicit-def: $vgpr0
+; UNROLL3-NEXT:  .LBB8_4: ; %Flow16
+; UNROLL3-NEXT:    s_andn2_saveexec_b32 s6, s6
+; UNROLL3-NEXT:    s_cbranch_execz .LBB8_7
+; UNROLL3-NEXT:  ; %bb.5: ; %memmove_bwd_residual
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:2044
+; UNROLL3-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:2040
+; UNROLL3-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:2036
+; UNROLL3-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:2032
+; UNROLL3-NEXT:    s_movk_i32 s4, 0xf820
+; UNROLL3-NEXT:    s_mov_b32 s5, -1
+; UNROLL3-NEXT:    s_waitcnt vmcnt(3)
+; UNROLL3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:2044
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:2040
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:2036
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:2032
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT:    v_add_nc_u32_e32 v2, 0x7b0, v0
+; UNROLL3-NEXT:    v_add_nc_u32_e32 v1, 0x7b0, v1
+; UNROLL3-NEXT:    s_waitcnt vmcnt(3)
+; UNROLL3-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT:  .LBB8_6: ; %memmove_bwd_loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    s_clause 0xb
+; UNROLL3-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT:    buffer_load_dword v10, v1, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT:    buffer_load_dword v11, v1, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    buffer_load_dword v12, v1, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    buffer_load_dword v13, v1, s[0:3], 0 offen
+; UNROLL3-NEXT:    v_subrev_nc_u32_e32 v1, 48, v1
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 48
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT:    s_waitcnt vmcnt(11)
+; UNROLL3-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT:    s_waitcnt vmcnt(10)
+; UNROLL3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT:    s_waitcnt vmcnt(9)
+; UNROLL3-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT:    s_waitcnt vmcnt(8)
+; UNROLL3-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT:    s_waitcnt vmcnt(7)
+; UNROLL3-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    s_waitcnt vmcnt(6)
+; UNROLL3-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT:    s_waitcnt vmcnt(5)
+; UNROLL3-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
+; UNROLL3-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT:    s_waitcnt vmcnt(3)
+; UNROLL3-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT:    s_waitcnt vmcnt(2)
+; UNROLL3-NEXT:    buffer_store_dword v11, v2, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    s_waitcnt vmcnt(1)
+; UNROLL3-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; UNROLL3-NEXT:    v_subrev_nc_u32_e32 v2, 48, v2
+; UNROLL3-NEXT:    s_cmp_eq_u64 s[4:5], 0
+; UNROLL3-NEXT:    s_cbranch_scc0 .LBB8_6
+; UNROLL3-NEXT:  .LBB8_7: ; %Flow17
+; UNROLL3-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; UNROLL3-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 2048, i1 false)
+  ret void
+}
+
+define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz2048:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_mov_b32 s6, exec_lo
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, -1, v0, vcc_lo
+; CHECK-NEXT:    v_cmpx_ge_u32_e64 v2, v3
+; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s6
+; CHECK-NEXT:    s_cbranch_execz .LBB9_2
+; CHECK-NEXT:  .LBB9_1: ; %memmove_fwd_loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_clause 0x3e
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    buffer_load_dword v22, v2, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v21, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v20, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v19, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v26, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v25, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v24, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v23, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    buffer_load_dword v30, v2, s[0:3], 0 offen offset:156
+; CHECK-NEXT:    buffer_load_dword v29, v2, s[0:3], 0 offen offset:152
+; CHECK-NEXT:    buffer_load_dword v28, v2, s[0:3], 0 offen offset:148
+; CHECK-NEXT:    buffer_load_dword v27, v2, s[0:3], 0 offen offset:144
+; CHECK-NEXT:    buffer_load_dword v34, v2, s[0:3], 0 offen offset:188
+; CHECK-NEXT:    buffer_load_dword v33, v2, s[0:3], 0 offen offset:184
+; CHECK-NEXT:    buffer_load_dword v32, v2, s[0:3], 0 offen offset:180
+; CHECK-NEXT:    buffer_load_dword v31, v2, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    buffer_load_dword v38, v2, s[0:3], 0 offen offset:172
+; CHECK-NEXT:    buffer_load_dword v37, v2, s[0:3], 0 offen offset:168
+; CHECK-NEXT:    buffer_load_dword v36, v2, s[0:3], 0 offen offset:164
+; CHECK-NEXT:    buffer_load_dword v35, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT:    buffer_load_dword v51, v2, s[0:3], 0 offen offset:220
+; CHECK-NEXT:    buffer_load_dword v50, v2, s[0:3], 0 offen offset:216
+; CHECK-NEXT:    buffer_load_dword v49, v2, s[0:3], 0 offen offset:212
+; CHECK-NEXT:    buffer_load_dword v48, v2, s[0:3], 0 offen offset:208
+; CHECK-NEXT:    buffer_load_dword v55, v2, s[0:3], 0 offen offset:252
+; CHECK-NEXT:    buffer_load_dword v54, v2, s[0:3], 0 offen offset:248
+; CHECK-NEXT:    buffer_load_dword v53, v2, s[0:3], 0 offen offset:244
+; CHECK-NEXT:    buffer_load_dword v52, v2, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    buffer_load_dword v67, v2, s[0:3], 0 offen offset:236
+; CHECK-NEXT:    buffer_load_dword v66, v2, s[0:3], 0 offen offset:232
+; CHECK-NEXT:    buffer_load_dword v65, v2, s[0:3], 0 offen offset:228
+; CHECK-NEXT:    buffer_load_dword v64, v2, s[0:3], 0 offen offset:224
+; CHECK-NEXT:    buffer_load_dword v71, v2, s[0:3], 0 offen offset:204
+; CHECK-NEXT:    buffer_load_dword v70, v2, s[0:3], 0 offen offset:200
+; CHECK-NEXT:    buffer_load_dword v69, v2, s[0:3], 0 offen offset:196
+; CHECK-NEXT:    buffer_load_dword v68, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT:    buffer_load_dword v83, v2, s[0:3], 0 offen offset:140
+; CHECK-NEXT:    buffer_load_dword v82, v2, s[0:3], 0 offen offset:136
+; CHECK-NEXT:    buffer_load_dword v81, v2, s[0:3], 0 offen offset:132
+; CHECK-NEXT:    buffer_load_dword v80, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT:    buffer_load_dword v87, v2, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    buffer_load_dword v86, v2, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v85, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v84, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v96, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v97, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v98, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v99, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_add_u32 s4, s4, 0x100
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, 0x100, v2
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:224
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:192
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[31:34] offset:176
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[35:38] offset:160
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[27:30] offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:128
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[19:22] offset:112
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[23:26] offset:96
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[15:18] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[11:14] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[7:10] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[3:6] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99]
+; CHECK-NEXT:    s_cmp_lg_u64 s[4:5], 0x800
+; CHECK-NEXT:    s_cbranch_scc1 .LBB9_1
+; CHECK-NEXT:  .LBB9_2: ; %Flow10
+; CHECK-NEXT:    s_andn2_saveexec_b32 s8, s6
+; CHECK-NEXT:    s_cbranch_execz .LBB9_5
+; CHECK-NEXT:  ; %bb.3: ; %memmove_bwd_loop.preheader
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, 0x700, v2
+; CHECK-NEXT:    s_movk_i32 s6, 0xff00
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0x700
+; CHECK-NEXT:    s_mov_b32 s7, -1
+; CHECK-NEXT:  .LBB9_4: ; %memmove_bwd_loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_clause 0x3e
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v22, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v21, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v20, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v19, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    buffer_load_dword v26, v2, s[0:3], 0 offen offset:252
+; CHECK-NEXT:    buffer_load_dword v25, v2, s[0:3], 0 offen offset:248
+; CHECK-NEXT:    buffer_load_dword v24, v2, s[0:3], 0 offen offset:244
+; CHECK-NEXT:    buffer_load_dword v23, v2, s[0:3], 0 offen offset:240
+; CHECK-NEXT:    buffer_load_dword v30, v2, s[0:3], 0 offen offset:236
+; CHECK-NEXT:    buffer_load_dword v29, v2, s[0:3], 0 offen offset:232
+; CHECK-NEXT:    buffer_load_dword v28, v2, s[0:3], 0 offen offset:228
+; CHECK-NEXT:    buffer_load_dword v27, v2, s[0:3], 0 offen offset:224
+; CHECK-NEXT:    buffer_load_dword v34, v2, s[0:3], 0 offen offset:220
+; CHECK-NEXT:    buffer_load_dword v33, v2, s[0:3], 0 offen offset:216
+; CHECK-NEXT:    buffer_load_dword v32, v2, s[0:3], 0 offen offset:212
+; CHECK-NEXT:    buffer_load_dword v31, v2, s[0:3], 0 offen offset:208
+; CHECK-NEXT:    buffer_load_dword v38, v2, s[0:3], 0 offen offset:204
+; CHECK-NEXT:    buffer_load_dword v37, v2, s[0:3], 0 offen offset:200
+; CHECK-NEXT:    buffer_load_dword v36, v2, s[0:3], 0 offen offset:196
+; CHECK-NEXT:    buffer_load_dword v35, v2, s[0:3], 0 offen offset:192
+; CHECK-NEXT:    buffer_load_dword v51, v2, s[0:3], 0 offen offset:188
+; CHECK-NEXT:    buffer_load_dword v50, v2, s[0:3], 0 offen offset:184
+; CHECK-NEXT:    buffer_load_dword v49, v2, s[0:3], 0 offen offset:180
+; CHECK-NEXT:    buffer_load_dword v48, v2, s[0:3], 0 offen offset:176
+; CHECK-NEXT:    buffer_load_dword v55, v2, s[0:3], 0 offen offset:172
+; CHECK-NEXT:    buffer_load_dword v54, v2, s[0:3], 0 offen offset:168
+; CHECK-NEXT:    buffer_load_dword v53, v2, s[0:3], 0 offen offset:164
+; CHECK-NEXT:    buffer_load_dword v52, v2, s[0:3], 0 offen offset:160
+; CHECK-NEXT:    buffer_load_dword v67, v2, s[0:3], 0 offen offset:156
+; CHECK-NEXT:    buffer_load_dword v66, v2, s[0:3], 0 offen offset:152
+; CHECK-NEXT:    buffer_load_dword v65, v2, s[0:3], 0 offen offset:148
+; CHECK-NEXT:    buffer_load_dword v64, v2, s[0:3], 0 offen offset:144
+; CHECK-NEXT:    buffer_load_dword v71, v2, s[0:3], 0 offen offset:140
+; CHECK-NEXT:    buffer_load_dword v70, v2, s[0:3], 0 offen offset:136
+; CHECK-NEXT:    buffer_load_dword v69, v2, s[0:3], 0 offen offset:132
+; CHECK-NEXT:    buffer_load_dword v68, v2, s[0:3], 0 offen offset:128
+; CHECK-NEXT:    buffer_load_dword v83, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v82, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v81, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v80, v2, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    buffer_load_dword v87, v2, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    buffer_load_dword v86, v2, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v85, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v84, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v96, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v97, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v98, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v99, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    v_add_co_u32 v100, vcc_lo, v0, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v101, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, 0xffffff00, v2
+; CHECK-NEXT:    s_add_u32 s4, s4, 0xffffff00
+; CHECK-NEXT:    s_addc_u32 s5, s5, -1
+; CHECK-NEXT:    s_waitcnt vmcnt(41)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[23:26] offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(37)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[27:30] offset:224
+; CHECK-NEXT:    s_waitcnt vmcnt(33)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[31:34] offset:208
+; CHECK-NEXT:    s_waitcnt vmcnt(29)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[35:38] offset:192
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[48:51] offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[52:55] offset:160
+; CHECK-NEXT:    s_waitcnt vmcnt(17)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[64:67] offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[68:71] offset:128
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[15:18] offset:112
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[19:22] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[80:83] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[84:87] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[11:14] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[7:10] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[3:6] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[100:101], v[96:99]
+; CHECK-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; CHECK-NEXT:    s_cbranch_scc0 .LBB9_4
+; CHECK-NEXT:  .LBB9_5: ; %Flow11
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memmove_p0_p5_sz2048:
+; ALIGNED:       ; %bb.0: ; %entry
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v72, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v73, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v74, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v75, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v76, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v77, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v78, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v79, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v88, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v89, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v90, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v91, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v92, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v93, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v94, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v95, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v104, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v105, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v106, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v107, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v108, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v109, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v110, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v111, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v120, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v121, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v122, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v123, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v124, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT:    s_mov_b32 s6, exec_lo
+; ALIGNED-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
+; ALIGNED-NEXT:    v_cmpx_ge_u32_e64 v2, v0
+; ALIGNED-NEXT:    s_xor_b32 s6, exec_lo, s6
+; ALIGNED-NEXT:    s_cbranch_execz .LBB9_2
+; ALIGNED-NEXT:  .LBB9_1: ; %memmove_fwd_loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    s_clause 0x39
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT:    buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT:    buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT:    buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT:    buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT:    buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT:    buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT:    buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT:    buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT:    buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT:    buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT:    buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT:    buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT:    buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT:    buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT:    buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT:    buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT:    buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT:    buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT:    buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT:    buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76
+; ALIGNED-NEXT:    buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT:    buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78
+; ALIGNED-NEXT:    buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79
+; ALIGNED-NEXT:    buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT:    buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT:    s_waitcnt vmcnt(57)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(56)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(55)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(54)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(53)
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(52)
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(51)
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(50)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(49)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(48)
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT:    s_waitcnt vmcnt(45)
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(44)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(43)
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v9, 8, v5
+; ALIGNED-NEXT:    s_waitcnt vmcnt(41)
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v8, 8, v6
+; ALIGNED-NEXT:    v_lshl_or_b32 v5, v10, 8, v7
+; ALIGNED-NEXT:    v_lshl_or_b32 v6, v11, 8, v12
+; ALIGNED-NEXT:    v_lshl_or_b32 v7, v15, 8, v14
+; ALIGNED-NEXT:    v_lshl_or_b32 v8, v19, 8, v17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(40)
+; ALIGNED-NEXT:    v_lshl_or_b32 v9, v16, 8, v13
+; ALIGNED-NEXT:    s_waitcnt vmcnt(38)
+; ALIGNED-NEXT:    v_lshl_or_b32 v10, v20, 8, v18
+; ALIGNED-NEXT:    s_waitcnt vmcnt(36)
+; ALIGNED-NEXT:    v_lshl_or_b32 v11, v23, 8, v22
+; ALIGNED-NEXT:    s_waitcnt vmcnt(34)
+; ALIGNED-NEXT:    v_lshl_or_b32 v12, v27, 8, v25
+; ALIGNED-NEXT:    s_waitcnt vmcnt(32)
+; ALIGNED-NEXT:    v_lshl_or_b32 v13, v24, 8, v21
+; ALIGNED-NEXT:    s_waitcnt vmcnt(30)
+; ALIGNED-NEXT:    v_lshl_or_b32 v14, v28, 8, v26
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v6, 16, v5
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v8, 16, v7
+; ALIGNED-NEXT:    v_lshl_or_b32 v5, v10, 16, v9
+; ALIGNED-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
+; ALIGNED-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(28)
+; ALIGNED-NEXT:    v_lshl_or_b32 v15, v30, 8, v29
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(26)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v32, 8, v34
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(24)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v36, 8, v31
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(22)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v35, 8, v33
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(12)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v48, 8, v37
+; ALIGNED-NEXT:    v_lshl_or_b32 v5, v39, 8, v38
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v6, v50, 8, v49
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(11)
+; ALIGNED-NEXT:    v_lshl_or_b32 v7, v51, 8, v52
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v0, 16, v15
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v5, 16, v4
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v54, 8, v53
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(11)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v55, 8, v65
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(9)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v66, 8, v64
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v68, 8, v67
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v70, 8, v69
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v80, 8, v71
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v127, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v81, 8, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:98
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:103
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:94
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:88
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:100
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:97
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:114
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:119
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:110
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:109
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:107
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:106
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:105
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:104
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:116
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:113
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:130
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:134
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:135
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:125
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:123
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:122
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:121
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:120
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:132
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:129
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v7
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v3
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141
+; ALIGNED-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x3
+; ALIGNED-NEXT:    buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:145
+; ALIGNED-NEXT:    buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:146
+; ALIGNED-NEXT:    buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v124, 8, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v120, 8, v111
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT:    buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT:    buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:151
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v121, 8, v3
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v109, 8, v122
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT:    buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:157
+; ALIGNED-NEXT:    buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT:    buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT:    buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v105, 8, v108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v104, 8, v107
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT:    buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT:    buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v92, 8, v93
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v95, 8, v90
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT:    buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT:    buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT:    buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT:    buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT:    buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT:    buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT:    buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v79, 8, v89
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v74, 8, v73
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v72, 8, v76
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v75, 8, v88
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT:    buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT:    buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT:    buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT:    buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v61, 8, v63
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v60, 8, v62
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT:    buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT:    buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v57, 8, v59
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v58, 8, v56
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT:    buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT:    buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT:    buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT:    buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT:    buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT:    buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT:    buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v45, 8, v47
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v42, 8, v41
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v40, 8, v44
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v43, 8, v46
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT:    buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT:    buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT:    buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT:    buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v117, 8, v119
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v116, 8, v118
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT:    buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT:    buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v113, 8, v115
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v114, 8, v112
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT:    buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT:    buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT:    buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT:    buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT:    buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT:    buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT:    buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v100, 8, v102
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v96, 8, v97
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v87, 8, v98
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v99, 8, v101
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT:    buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT:    buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT:    buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT:    buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v83, 8, v85
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v82, 8, v84
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT:    buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT:    buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v70, 8, v80
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v81, 8, v69
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT:    buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT:    buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT:    buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT:    buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v54, 8, v68
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v53, 8, v66
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT:    buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT:    buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT:    buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT:    buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT:    buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT:    buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT:    buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v64, 8, v67
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v50, 8, v49
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v51, 8, v65
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v48, 8, v52
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT:    buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT:    buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v37, 8, v39
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v55, 8, v38
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT:    buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT:    buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v34, 8, v36
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v30, 8, v31
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v4, 16, v3
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v33, 8, v35
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v29, 8, v32
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x17
+; ALIGNED-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT:    v_lshl_or_b32 v110, v4, 16, v3
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT:    buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT:    buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT:    buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT:    s_waitcnt vmcnt(27)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v27, 8, v28
+; ALIGNED-NEXT:    s_waitcnt vmcnt(25)
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v25, 8, v26
+; ALIGNED-NEXT:    s_waitcnt vmcnt(13)
+; ALIGNED-NEXT:    v_lshl_or_b32 v77, v13, 8, v16
+; ALIGNED-NEXT:    s_waitcnt vmcnt(9)
+; ALIGNED-NEXT:    v_lshl_or_b32 v91, v9, 8, v10
+; ALIGNED-NEXT:    v_lshl_or_b32 v94, v4, 16, v3
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v22, 8, v24
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v23, 8, v21
+; ALIGNED-NEXT:    v_lshl_or_b32 v78, v4, 16, v3
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v18, 8, v20
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v14, 8, v15
+; ALIGNED-NEXT:    v_lshl_or_b32 v103, v4, 16, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v17, 8, v19
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v106, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v123, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v125, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v86, v77, 16, v4
+; ALIGNED-NEXT:    v_lshl_or_b32 v77, v11, 8, v12
+; ALIGNED-NEXT:    v_lshl_or_b32 v71, v91, 16, v77
+; ALIGNED-NEXT:    v_lshl_or_b32 v77, v6, 8, v8
+; ALIGNED-NEXT:    v_lshl_or_b32 v91, v7, 8, v5
+; ALIGNED-NEXT:    v_lshl_or_b32 v4, v91, 16, v77
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT:    buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v77, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v91, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v77, v77, 8, v1
+; ALIGNED-NEXT:    v_lshl_or_b32 v91, v0, 8, v91
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v91, 16, v77
+; ALIGNED-NEXT:    v_lshl_or_b32 v77, v123, 8, v106
+; ALIGNED-NEXT:    v_lshl_or_b32 v91, v3, 8, v125
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v91, 16, v77
+; ALIGNED-NEXT:    buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT:    buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v77, v3, 8, v1
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v91, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v91, v91, 8, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v91, 16, v77
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT:    buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT:    buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v77, v125, 8, v1
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v91, v126, 8, v123
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v91, 16, v77
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT:    buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT:    buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT:    buffer_store_dword v71, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT:    buffer_store_dword v86, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT:    buffer_store_dword v103, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:704
+; ALIGNED-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:708
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v2, 0x100, v2
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v127, 8, v77
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v127, v91, 8, v106
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_add_co_u32 v3, vcc_lo, v3, s4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s5, v4, vcc_lo
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v5 offset:250
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v7 offset:251
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v6 offset:249
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v9 offset:255
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v11 offset:253
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v10 offset:254
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v12 offset:252
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v8 offset:248
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v15 offset:242
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v14 offset:243
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v18 offset:241
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v13 offset:247
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v17 offset:245
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v16 offset:246
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v19 offset:244
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v20 offset:240
+; ALIGNED-NEXT:    buffer_store_dword v78, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT:    buffer_store_dword v94, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT:    buffer_store_dword v110, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT:    v_lshl_or_b32 v127, v0, 16, v127
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT:    s_cmp_lg_u64 s[4:5], 0x800
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v21 offset:234
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v23 offset:235
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v22 offset:233
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v25 offset:239
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v27 offset:237
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v26 offset:238
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v28 offset:236
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v24 offset:232
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v31 offset:226
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v30 offset:227
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v34 offset:225
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v29 offset:231
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v33 offset:229
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v32 offset:230
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v35 offset:228
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v36 offset:224
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v54 offset:213
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v53 offset:215
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v37 offset:209
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v55 offset:211
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v38 offset:210
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v66 offset:214
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v68 offset:212
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v49 offset:218
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v50 offset:219
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v64 offset:217
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v48 offset:223
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v51 offset:221
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v52 offset:222
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v65 offset:220
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v67 offset:216
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v39 offset:208
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v69 offset:202
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v81 offset:203
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v70 offset:201
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v82 offset:207
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v83 offset:205
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v84 offset:206
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v85 offset:204
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v80 offset:200
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v97 offset:194
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v96 offset:195
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v100 offset:193
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v87 offset:199
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v99 offset:197
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v98 offset:198
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v101 offset:196
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v102 offset:192
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:296
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:300
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:292
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:288
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v112 offset:186
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v114 offset:187
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v113 offset:185
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v116 offset:191
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v117 offset:189
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v118 offset:190
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v119 offset:188
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v115 offset:184
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v41 offset:178
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v42 offset:179
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v45 offset:177
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v40 offset:183
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v43 offset:181
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v44 offset:182
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v46 offset:180
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v47 offset:176
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:312
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:316
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:308
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:304
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v56 offset:170
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v58 offset:171
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v57 offset:169
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v60 offset:175
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v61 offset:173
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v62 offset:174
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v63 offset:172
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v59 offset:168
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v73 offset:162
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v74 offset:163
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v79 offset:161
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v72 offset:167
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v75 offset:165
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v76 offset:166
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v88 offset:164
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v89 offset:160
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:264
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:268
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:260
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:256
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v90 offset:154
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v95 offset:155
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v92 offset:153
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v104 offset:159
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v105 offset:157
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v107 offset:158
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v108 offset:156
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v93 offset:152
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v111 offset:146
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v120 offset:147
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v124 offset:145
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v109 offset:151
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v121 offset:149
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v122 offset:150
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:148
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:144
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:280
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:284
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:276
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:272
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:138
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:139
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:137
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:143
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:141
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:142
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:140
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:136
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:130
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:131
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:129
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:135
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:133
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:134
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:132
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:128
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:360
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:364
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:356
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:352
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:122
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:123
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:121
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:127
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:125
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:126
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:124
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:120
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:114
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:115
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:113
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:119
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:117
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:118
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:116
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:112
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:376
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:380
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:372
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:368
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:106
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:107
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:105
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:111
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:109
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:110
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:108
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:104
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:98
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:99
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:97
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:103
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:101
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:102
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:100
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:96
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:328
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:332
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:324
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:320
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:90
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:91
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:89
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:95
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:93
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:94
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:92
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:88
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:82
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:83
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:81
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:87
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:85
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:86
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:84
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:80
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:344
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:348
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:340
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:336
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:74
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:75
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:73
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:79
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:77
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:78
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:76
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:72
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:66
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:67
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:65
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:71
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:69
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:70
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:68
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:64
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:424
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:428
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:420
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:416
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:61
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:58
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:59
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:57
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:63
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:62
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:60
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:56
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:53
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:50
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:51
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:49
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:55
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:54
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:52
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:48
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:444
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:440
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:436
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:432
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:43
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:42
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:41
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:40
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:47
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:46
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:45
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:44
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:35
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:34
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:33
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:32
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:39
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:38
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:37
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:36
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:392
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:396
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:388
+; ALIGNED-NEXT:    buffer_store_dword v127, off, s[0:3], s32 offset:384
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:26
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:27
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:25
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:31
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:29
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:30
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:28
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:24
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v77 offset:18
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:19
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v91 offset:17
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:23
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:21
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:22
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:20
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v106 offset:16
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:408
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:412
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:404
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:400
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v123 offset:10
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v126 offset:11
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:13
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v125 offset:9
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:15
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:14
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:12
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v1 offset:8
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:2
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:3
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:1
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:7
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:5
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:6
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0 offset:4
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[3:4], v0
+; ALIGNED-NEXT:    s_cbranch_scc1 .LBB9_1
+; ALIGNED-NEXT:  .LBB9_2: ; %Flow10
+; ALIGNED-NEXT:    s_andn2_saveexec_b32 s8, s6
+; ALIGNED-NEXT:    s_cbranch_execz .LBB9_5
+; ALIGNED-NEXT:  ; %bb.3: ; %memmove_bwd_loop.preheader
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v4, 0x700, v2
+; ALIGNED-NEXT:    s_movk_i32 s6, 0xff00
+; ALIGNED-NEXT:    s_mov_b64 s[4:5], 0x700
+; ALIGNED-NEXT:    s_mov_b32 s7, -1
+; ALIGNED-NEXT:  .LBB9_4: ; %memmove_bwd_loop
+; ALIGNED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT:    s_clause 0x3a
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT:    buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT:    buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT:    buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT:    buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT:    buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT:    buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT:    buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT:    buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT:    buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT:    buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT:    buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT:    buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT:    buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT:    buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT:    buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT:    buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT:    buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT:    buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT:    buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT:    buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT:    buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT:    buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT:    buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT:    buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT:    buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT:    buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT:    buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT:    buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT:    buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT:    buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT:    buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT:    buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT:    buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT:    buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT:    buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT:    buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT:    buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT:    buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT:    buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT:    buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT:    buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT:    buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT:    buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT:    buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT:    buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76
+; ALIGNED-NEXT:    buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT:    buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78
+; ALIGNED-NEXT:    buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79
+; ALIGNED-NEXT:    buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT:    buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT:    buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:151
+; ALIGNED-NEXT:    s_waitcnt vmcnt(58)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(57)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(56)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(55)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(54)
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(53)
+; ALIGNED-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(52)
+; ALIGNED-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(51)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(50)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(49)
+; ALIGNED-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT:    s_waitcnt vmcnt(46)
+; ALIGNED-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(45)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(44)
+; ALIGNED-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v9, 8, v5
+; ALIGNED-NEXT:    s_waitcnt vmcnt(42)
+; ALIGNED-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v8, 8, v6
+; ALIGNED-NEXT:    v_lshl_or_b32 v5, v10, 8, v7
+; ALIGNED-NEXT:    v_lshl_or_b32 v6, v11, 8, v12
+; ALIGNED-NEXT:    v_lshl_or_b32 v7, v15, 8, v14
+; ALIGNED-NEXT:    v_lshl_or_b32 v8, v19, 8, v17
+; ALIGNED-NEXT:    s_waitcnt vmcnt(41)
+; ALIGNED-NEXT:    v_lshl_or_b32 v9, v16, 8, v13
+; ALIGNED-NEXT:    s_waitcnt vmcnt(39)
+; ALIGNED-NEXT:    v_lshl_or_b32 v10, v20, 8, v18
+; ALIGNED-NEXT:    s_waitcnt vmcnt(37)
+; ALIGNED-NEXT:    v_lshl_or_b32 v11, v23, 8, v22
+; ALIGNED-NEXT:    s_waitcnt vmcnt(35)
+; ALIGNED-NEXT:    v_lshl_or_b32 v12, v28, 8, v25
+; ALIGNED-NEXT:    s_waitcnt vmcnt(33)
+; ALIGNED-NEXT:    v_lshl_or_b32 v13, v24, 8, v21
+; ALIGNED-NEXT:    s_waitcnt vmcnt(31)
+; ALIGNED-NEXT:    v_lshl_or_b32 v14, v27, 8, v26
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
+; ALIGNED-NEXT:    v_lshl_or_b32 v5, v10, 16, v9
+; ALIGNED-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
+; ALIGNED-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
+; ALIGNED-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(29)
+; ALIGNED-NEXT:    v_lshl_or_b32 v15, v31, 8, v30
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(27)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v34, 8, v33
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(25)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v37, 8, v32
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(23)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v36, 8, v35
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(18)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v50, 8, v38
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(16)
+; ALIGNED-NEXT:    v_lshl_or_b32 v5, v49, 8, v39
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v6, v51, 8, v48
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(12)
+; ALIGNED-NEXT:    v_lshl_or_b32 v7, v53, 8, v52
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v0, 16, v15
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v5, 16, v3
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v55, 8, v29
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(12)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v67, 8, v66
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(10)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v64, 8, v54
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v68, 8, v65
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v70, 8, v69
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v80, 8, v71
+; ALIGNED-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(9)
+; ALIGNED-NEXT:    buffer_store_dword v126, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(8)
+; ALIGNED-NEXT:    buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT:    s_waitcnt vmcnt(7)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v81, 8, v2
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:98
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:103
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v2
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:94
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:93
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:91
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v2
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:90
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:89
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:88
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:100
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:97
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:114
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:119
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v2
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:110
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:109
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:107
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v2
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:106
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:105
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:104
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:116
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:113
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:130
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:134
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:135
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v2
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:125
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:123
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v2
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:122
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:121
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:120
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:132
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:129
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:146
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v8, 8, v6
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v2
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:142
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:141
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 8, v2
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v2
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT:    buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148
+; ALIGNED-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v7
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v125, 8, v6
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v5, 8, v2
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT:    buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:157
+; ALIGNED-NEXT:    buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT:    buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT:    buffer_load_ubyte v106, v4, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v111, 8, v122
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v109, 8, v120
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT:    buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT:    buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v94, 8, v105
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v106, 8, v92
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT:    buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT:    buffer_load_ubyte v73, v4, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT:    buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT:    buffer_load_ubyte v88, v4, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT:    buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT:    buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT:    buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v79, 8, v89
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v74, 8, v73
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v72, 8, v77
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v75, 8, v88
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT:    buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT:    buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT:    buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT:    buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v61, 8, v63
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v60, 8, v62
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT:    buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT:    buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v56, 8, v59
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v58, 8, v47
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT:    buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT:    buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT:    buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT:    buffer_load_ubyte v45, v4, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT:    buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT:    buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT:    buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v44, 8, v46
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v40, 8, v119
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v118, 8, v42
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v41, 8, v45
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT:    buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT:    buffer_load_ubyte v116, v4, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT:    buffer_load_ubyte v114, v4, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT:    buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v115, 8, v117
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v114, 8, v116
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT:    buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT:    buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v103, 8, v113
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v1, v112, 8, v102
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT:    buffer_load_ubyte v98, v4, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT:    buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT:    buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT:    buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT:    buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT:    buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT:    buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v98, 8, v100
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v86, 8, v87
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v97, 8, v99
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v85, 8, v96
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT:    buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT:    buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT:    buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT:    buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v81, 8, v83
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v80, 8, v82
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT:    buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT:    buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v69, 8, v70
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v71, 8, v68
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT:    buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT:    buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT:    buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT:    buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v54, 8, v67
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v52, 8, v65
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT:    buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT:    buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT:    buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT:    buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT:    buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT:    buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT:    buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v53, 8, v66
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v48, 8, v49
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v51, 8, v64
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v39, 8, v50
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT:    buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT:    buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v36, 8, v38
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v55, 8, v37
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x7
+; ALIGNED-NEXT:    buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT:    buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT:    buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT:    buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT:    buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT:    buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT:    buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT:    buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT:    s_waitcnt vmcnt(6)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v33, 8, v35
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v30, 8, v29
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v31, 8, v34
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v28, 8, v32
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x17
+; ALIGNED-NEXT:    buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT:    buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT:    buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT:    buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT:    buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT:    buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT:    buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT:    buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT:    buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT:    buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT:    buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT:    buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT:    buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT:    buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT:    buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT:    buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT:    buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT:    buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT:    buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT:    buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT:    buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT:    buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT:    buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT:    buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT:    v_lshl_or_b32 v123, v3, 16, v2
+; ALIGNED-NEXT:    buffer_load_ubyte v0, v4, s[0:3], 0 offen
+; ALIGNED-NEXT:    s_waitcnt vmcnt(23)
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v25, 8, v27
+; ALIGNED-NEXT:    s_waitcnt vmcnt(21)
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v24, 8, v26
+; ALIGNED-NEXT:    s_waitcnt vmcnt(9)
+; ALIGNED-NEXT:    v_lshl_or_b32 v43, v12, 8, v16
+; ALIGNED-NEXT:    s_waitcnt vmcnt(5)
+; ALIGNED-NEXT:    v_lshl_or_b32 v57, v8, 8, v10
+; ALIGNED-NEXT:    v_lshl_or_b32 v104, v3, 16, v2
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v21, 8, v22
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v23, 8, v20
+; ALIGNED-NEXT:    v_lshl_or_b32 v76, v3, 16, v2
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v17, 8, v19
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v14, 8, v13
+; ALIGNED-NEXT:    v_lshl_or_b32 v101, v3, 16, v2
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v15, 8, v18
+; ALIGNED-NEXT:    v_lshl_or_b32 v84, v43, 16, v3
+; ALIGNED-NEXT:    v_lshl_or_b32 v43, v9, 8, v11
+; ALIGNED-NEXT:    v_lshl_or_b32 v3, v57, 16, v43
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v43, v5, 8, v6
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v57, v7, 8, v1
+; ALIGNED-NEXT:    v_lshl_or_b32 v2, v57, 16, v43
+; ALIGNED-NEXT:    buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x5
+; ALIGNED-NEXT:    buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT:    buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT:    buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT:    buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT:    buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT:    buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT:    v_lshl_or_b32 v43, v43, 8, v0
+; ALIGNED-NEXT:    s_waitcnt vmcnt(4)
+; ALIGNED-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v57, v57, 8, v127
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    buffer_store_dword v78, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    buffer_store_dword v90, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    buffer_store_dword v91, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v57, 16, v43
+; ALIGNED-NEXT:    v_lshl_or_b32 v43, v90, 8, v78
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v57, v124, 8, v91
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v57, 16, v43
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x4
+; ALIGNED-NEXT:    buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT:    buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT:    buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT:    buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT:    buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v43, v107, 8, v121
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v57, v108, 8, v110
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v57, 16, v43
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT:    buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT:    buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_lshl_or_b32 v43, v91, 8, v95
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_lshl_or_b32 v57, v93, 8, v90
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v57, 16, v43
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill
+; ALIGNED-NEXT:    s_clause 0x2
+; ALIGNED-NEXT:    buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT:    buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT:    buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:488
+; ALIGNED-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:492
+; ALIGNED-NEXT:    buffer_store_dword v84, off, s[0:3], s32 offset:484
+; ALIGNED-NEXT:    buffer_store_dword v101, off, s[0:3], s32 offset:480
+; ALIGNED-NEXT:    s_clause 0x1
+; ALIGNED-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:704
+; ALIGNED-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:708
+; ALIGNED-NEXT:    v_add_nc_u32_e32 v4, 0xffffff00, v4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(3)
+; ALIGNED-NEXT:    v_lshl_or_b32 v0, v126, 8, v43
+; ALIGNED-NEXT:    s_waitcnt vmcnt(2)
+; ALIGNED-NEXT:    v_lshl_or_b32 v126, v57, 8, v78
+; ALIGNED-NEXT:    s_waitcnt vmcnt(1)
+; ALIGNED-NEXT:    v_add_co_u32 v2, vcc_lo, v2, s4
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s5, v3, vcc_lo
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v1 offset:250
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v7 offset:251
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v5 offset:249
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v8 offset:255
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v9 offset:253
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v10 offset:254
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v11 offset:252
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v6 offset:248
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v13 offset:242
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v14 offset:243
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v17 offset:241
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v12 offset:247
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v15 offset:245
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v16 offset:246
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v18 offset:244
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v19 offset:240
+; ALIGNED-NEXT:    buffer_store_dword v76, off, s[0:3], s32 offset:504
+; ALIGNED-NEXT:    buffer_store_dword v104, off, s[0:3], s32 offset:508
+; ALIGNED-NEXT:    buffer_store_dword v123, off, s[0:3], s32 offset:500
+; ALIGNED-NEXT:    v_lshl_or_b32 v126, v0, 16, v126
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_add_u32 s4, s4, 0xffffff00
+; ALIGNED-NEXT:    s_addc_u32 s5, s5, -1
+; ALIGNED-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:496
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v20 offset:234
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v23 offset:235
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v21 offset:233
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v24 offset:239
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v25 offset:237
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v26 offset:238
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v27 offset:236
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v22 offset:232
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v29 offset:226
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v30 offset:227
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v33 offset:225
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v28 offset:231
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v31 offset:229
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v32 offset:230
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v34 offset:228
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v35 offset:224
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:448
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:460
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:456
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:452
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v54 offset:213
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v52 offset:215
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v36 offset:209
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v55 offset:211
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v37 offset:210
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v65 offset:214
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v67 offset:212
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v49 offset:218
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v48 offset:219
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v53 offset:217
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v39 offset:223
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v51 offset:221
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v50 offset:222
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v64 offset:220
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v66 offset:216
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v38 offset:208
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:472
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:476
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:468
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:464
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v68 offset:202
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v71 offset:203
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v69 offset:201
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v80 offset:207
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v81 offset:205
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v82 offset:206
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v83 offset:204
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v70 offset:200
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v87 offset:194
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v86 offset:195
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v98 offset:193
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v85 offset:199
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v97 offset:197
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v96 offset:198
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v99 offset:196
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v100 offset:192
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:552
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:556
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:548
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:544
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v102 offset:186
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v112 offset:187
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v103 offset:185
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v114 offset:191
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v115 offset:189
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v116 offset:190
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v117 offset:188
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v113 offset:184
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v119 offset:178
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v40 offset:179
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v44 offset:177
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v118 offset:183
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v41 offset:181
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v42 offset:182
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v45 offset:180
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v46 offset:176
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:568
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:572
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:564
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:560
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v47 offset:170
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v58 offset:171
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v56 offset:169
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v60 offset:175
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v61 offset:173
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v62 offset:174
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v63 offset:172
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v59 offset:168
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v73 offset:162
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v74 offset:163
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v79 offset:161
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v72 offset:167
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v75 offset:165
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v77 offset:166
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v88 offset:164
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v89 offset:160
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:520
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:524
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:516
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:512
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v92 offset:154
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v106 offset:155
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v94 offset:153
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v109 offset:159
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v111 offset:157
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v120 offset:158
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v122 offset:156
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v105 offset:152
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:146
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:147
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:145
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v125 offset:151
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:149
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:150
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:148
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:144
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:536
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:540
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:532
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:528
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:138
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:139
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:137
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:143
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:141
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:142
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:140
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:136
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:130
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:131
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:129
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:135
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:133
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:134
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:132
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:128
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:616
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:620
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:612
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:608
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:122
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:123
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:121
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:127
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:125
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:126
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:124
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:120
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:114
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:115
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:113
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:119
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:117
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:118
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:116
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:112
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:632
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:636
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:628
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:624
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:106
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:107
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:105
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:111
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:109
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:110
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:108
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:104
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:98
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:99
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:97
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:103
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:101
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:102
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:100
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:96
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:584
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:588
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:580
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:576
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:90
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:91
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:89
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:95
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:93
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:94
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:92
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:88
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:82
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:83
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:81
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:87
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:85
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:86
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:84
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:80
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:600
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:604
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:596
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:592
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:74
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:75
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:73
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:79
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:77
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:78
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:76
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:72
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:66
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:67
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:65
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:71
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:69
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:70
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:68
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:64
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:680
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:684
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:676
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:672
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:61
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:58
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:59
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:57
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:63
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:62
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:60
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:56
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:53
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:50
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:51
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:49
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:55
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:54
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:52
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:48
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:700
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:696
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:692
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:688
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:43
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:42
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:41
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:40
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:47
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:46
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:45
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:44
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:35
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:34
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:33
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:32
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:39
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:38
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:37
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:36
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:648
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:652
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:644
+; ALIGNED-NEXT:    buffer_store_dword v126, off, s[0:3], s32 offset:640
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:26
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:27
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:25
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:31
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:29
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:30
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:28
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:24
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v43 offset:18
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:19
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v57 offset:17
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:23
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:21
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:22
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:20
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v78 offset:16
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:664
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:668
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:660
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:656
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v90 offset:10
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v93 offset:11
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v107 offset:13
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v91 offset:9
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v108 offset:15
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v110 offset:14
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v121 offset:12
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v95 offset:8
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v127 offset:2
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:3
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:1
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v124 offset:7
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:5
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:6
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0 offset:4
+; ALIGNED-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; ALIGNED-NEXT:    flat_store_byte v[2:3], v0
+; ALIGNED-NEXT:    s_cbranch_scc0 .LBB9_4
+; ALIGNED-NEXT:  .LBB9_5: ; %Flow11
+; ALIGNED-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; ALIGNED-NEXT:    s_clause 0x2f
+; ALIGNED-NEXT:    buffer_load_dword v127, off, s[0:3], s32
+; ALIGNED-NEXT:    buffer_load_dword v126, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT:    buffer_load_dword v125, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT:    buffer_load_dword v124, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT:    buffer_load_dword v123, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT:    buffer_load_dword v122, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT:    buffer_load_dword v121, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT:    buffer_load_dword v120, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT:    buffer_load_dword v111, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT:    buffer_load_dword v110, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT:    buffer_load_dword v109, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT:    buffer_load_dword v108, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT:    buffer_load_dword v107, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT:    buffer_load_dword v106, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT:    buffer_load_dword v105, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT:    buffer_load_dword v104, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT:    buffer_load_dword v95, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT:    buffer_load_dword v94, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT:    buffer_load_dword v93, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT:    buffer_load_dword v92, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT:    buffer_load_dword v91, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT:    buffer_load_dword v90, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT:    buffer_load_dword v89, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT:    buffer_load_dword v88, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT:    buffer_load_dword v79, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT:    buffer_load_dword v78, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT:    buffer_load_dword v77, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT:    buffer_load_dword v76, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT:    buffer_load_dword v75, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT:    buffer_load_dword v74, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT:    buffer_load_dword v73, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT:    buffer_load_dword v72, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memmove_p0_p5_sz2048:
+; UNROLL3:       ; %bb.0: ; %entry
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT:    s_mov_b32 s6, exec_lo
+; UNROLL3-NEXT:    v_cndmask_b32_e32 v3, -1, v0, vcc_lo
+; UNROLL3-NEXT:    v_cmpx_ge_u32_e64 v2, v3
+; UNROLL3-NEXT:    s_xor_b32 s6, exec_lo, s6
+; UNROLL3-NEXT:    s_cbranch_execz .LBB9_4
+; UNROLL3-NEXT:  ; %bb.1: ; %memmove_fwd_loop.preheader
+; UNROLL3-NEXT:    v_mov_b32_e32 v3, v2
+; UNROLL3-NEXT:    s_inst_prefetch 0x1
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB9_2: ; %memmove_fwd_loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    s_clause 0xb
+; UNROLL3-NEXT:    buffer_load_dword v4, v3, s[0:3], 0 offen
+; UNROLL3-NEXT:    buffer_load_dword v5, v3, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    buffer_load_dword v6, v3, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    buffer_load_dword v7, v3, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT:    buffer_load_dword v8, v3, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT:    buffer_load_dword v9, v3, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT:    buffer_load_dword v10, v3, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT:    buffer_load_dword v11, v3, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    buffer_load_dword v12, v3, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT:    buffer_load_dword v13, v3, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT:    buffer_load_dword v14, v3, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT:    buffer_load_dword v15, v3, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT:    v_add_co_u32 v16, vcc_lo, v0, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 48
+; UNROLL3-NEXT:    v_add_nc_u32_e32 v3, 48, v3
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[8:11] offset:16
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[16:17], v[12:15] offset:32
+; UNROLL3-NEXT:    s_cmp_lg_u64 s[4:5], 0x7e0
+; UNROLL3-NEXT:    s_cbranch_scc1 .LBB9_2
+; UNROLL3-NEXT:  ; %bb.3: ; %memmove_fwd_residual
+; UNROLL3-NEXT:    s_inst_prefetch 0x2
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:2016
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:2032
+; UNROLL3-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:2036
+; UNROLL3-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:2040
+; UNROLL3-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:2044
+; UNROLL3-NEXT:    ; implicit-def: $vgpr2
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:2032
+; UNROLL3-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; UNROLL3-NEXT:  .LBB9_4: ; %Flow8
+; UNROLL3-NEXT:    s_andn2_saveexec_b32 s8, s6
+; UNROLL3-NEXT:    s_cbranch_execz .LBB9_7
+; UNROLL3-NEXT:  ; %bb.5: ; %memmove_bwd_residual
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:2032
+; UNROLL3-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:2036
+; UNROLL3-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:2040
+; UNROLL3-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:2044
+; UNROLL3-NEXT:    s_movk_i32 s6, 0xffd0
+; UNROLL3-NEXT:    s_mov_b64 s[4:5], 0x7b0
+; UNROLL3-NEXT:    s_mov_b32 s7, -1
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:2032
+; UNROLL3-NEXT:    s_clause 0x3
+; UNROLL3-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT:    v_add_nc_u32_e32 v2, 0x7b0, v2
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:2016
+; UNROLL3-NEXT:    s_inst_prefetch 0x1
+; UNROLL3-NEXT:    .p2align 6
+; UNROLL3-NEXT:  .LBB9_6: ; %memmove_bwd_loop
+; UNROLL3-NEXT:    ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT:    s_clause 0xb
+; UNROLL3-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; UNROLL3-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT:    v_add_co_u32 v15, vcc_lo, v0, s4
+; UNROLL3-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, s5, v1, vcc_lo
+; UNROLL3-NEXT:    v_subrev_nc_u32_e32 v2, 48, v2
+; UNROLL3-NEXT:    s_add_u32 s4, s4, 0xffffffd0
+; UNROLL3-NEXT:    s_addc_u32 s5, s5, -1
+; UNROLL3-NEXT:    s_waitcnt vmcnt(4)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[15:16], v[7:10] offset:16
+; UNROLL3-NEXT:    flat_store_dwordx4 v[15:16], v[3:6]
+; UNROLL3-NEXT:    s_waitcnt vmcnt(0)
+; UNROLL3-NEXT:    flat_store_dwordx4 v[15:16], v[11:14] offset:32
+; UNROLL3-NEXT:    s_cmp_eq_u64 s[4:5], s[6:7]
+; UNROLL3-NEXT:    s_cbranch_scc0 .LBB9_6
+; UNROLL3-NEXT:  .LBB9_7: ; %Flow9
+; UNROLL3-NEXT:    s_inst_prefetch 0x2
+; UNROLL3-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; UNROLL3-NEXT:    s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 2048, i1 false)
+  ret void
+}
+
+
+declare void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
+
+declare void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
+
+declare void @llvm.memmove.p0.p0.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p0.p4.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2
+
+declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2
+
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }


        


More information about the llvm-commits mailing list