[llvm] Enable generic overlapping optimization for memmove (PR #177885)

Osama Abdelkader via llvm-commits llvm-commits at lists.llvm.org
Sun Jan 25 16:10:07 PST 2026


https://github.com/osamakader updated https://github.com/llvm/llvm-project/pull/177885

>From 2635442658975051a1332bc2a3e5911c7f27029c Mon Sep 17 00:00:00 2001
From: Osama Abdelkader <osama.abdelkader at gmail.com>
Date: Sun, 25 Jan 2026 22:40:15 +0100
Subject: [PATCH 1/2] Enable generic overlapping optimization for memmove

This change enables memmove to use the same generic overlapping load/store
optimization that memcpy uses, instead of requiring target-specific code.

Changes:
1. Pass isVol to MemOp::Copy in getMemmoveLoadsAndStores instead of hardcoding
   IsVolatile=true. allowOverlap() can then return true for a non-volatile
   memmove, which lets findOptimalMemOpLowering generate overlapping MemOps.

2. Add overlapping load/store handling to memmove, matching memcpy's
   implementation. This adjusts offsets when the last MemOp is larger than
   the remaining size, creating overlapping loads/stores for non-power-of-two
   sizes.

Fixes: #165948
Signed-off-by: Osama Abdelkader <osama.abdelkader at gmail.com>
---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 16 ++-
 llvm/test/CodeGen/AArch64/memmove-inline.ll   | 98 +++++++++++++++++++
 2 files changed, 112 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4ca1bb053fce5..cf7eb82ba5ed0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8966,8 +8966,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
   unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
   if (!TLI.findOptimalMemOpLowering(
           C, MemOps, Limit,
-          MemOp::Copy(Size, DstAlignCanChange, Alignment, *SrcAlign,
-                      /*IsVolatile*/ true),
+          MemOp::Copy(Size, DstAlignCanChange, Alignment, *SrcAlign, isVol),
           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
           MF.getFunction().getAttributes()))
     return SDValue();
@@ -9008,6 +9007,12 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
     unsigned VTSize = VT.getSizeInBits() / 8;
     SDValue Value;
 
+    if (i == NumMemOps - 1 && i != 0 && VTSize > Size - SrcOff) {
+      // Issuing an unaligned load that overlaps with the previous load.
+      // Move the source offset back so the load ends at the last byte.
+      SrcOff -= VTSize - (Size - SrcOff);
+    }
+
     bool isDereferenceable =
         SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
     MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
@@ -9024,11 +9029,18 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
   }
   Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
   OutChains.clear();
+  DstOff = 0;
   for (unsigned i = 0; i < NumMemOps; i++) {
     EVT VT = MemOps[i];
     unsigned VTSize = VT.getSizeInBits() / 8;
     SDValue Store;
 
+    if (i == NumMemOps - 1 && i != 0 && VTSize > Size - DstOff) {
+      // Issuing an unaligned store that overlaps with the previous store.
+      // Move the destination offset back so the store ends at the last byte.
+      DstOff -= VTSize - (Size - DstOff);
+    }
+
     Store = DAG.getStore(
         Chain, dl, LoadValues[i],
         DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
diff --git a/llvm/test/CodeGen/AArch64/memmove-inline.ll b/llvm/test/CodeGen/AArch64/memmove-inline.ll
index 641c48dd0f1c5..4ca180616442d 100644
--- a/llvm/test/CodeGen/AArch64/memmove-inline.ll
+++ b/llvm/test/CodeGen/AArch64/memmove-inline.ll
@@ -120,3 +120,101 @@ entry:
 }
 
 declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)
+
+; Test memmove lowering for non-power-of-two sizes.
+; Sizes 7-25 use overlapping loads/stores; 33, 49 and 65 end in a byte access.
+
+define void @move7(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move7:
+; CHECK-ALIGNED:       // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT:    ldur w8, [x1, #3]
+; CHECK-ALIGNED-NEXT:    ldr w9, [x1]
+; CHECK-ALIGNED-NEXT:    stur w8, [x0, #3]
+; CHECK-ALIGNED-NEXT:    str w9, [x0]
+; CHECK-ALIGNED-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 7, i1 false)
+  ret void
+}
+
+define void @move13(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move13:
+; CHECK-ALIGNED:       // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT:    ldur x8, [x1, #5]
+; CHECK-ALIGNED-NEXT:    ldr x9, [x1]
+; CHECK-ALIGNED-NEXT:    stur x8, [x0, #5]
+; CHECK-ALIGNED-NEXT:    str x9, [x0]
+; CHECK-ALIGNED-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 13, i1 false)
+  ret void
+}
+
+define void @move15(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move15:
+; CHECK-ALIGNED:       // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT:    ldur x8, [x1, #7]
+; CHECK-ALIGNED-NEXT:    ldr x9, [x1]
+; CHECK-ALIGNED-NEXT:    stur x8, [x0, #7]
+; CHECK-ALIGNED-NEXT:    str x9, [x0]
+; CHECK-ALIGNED-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 15, i1 false)
+  ret void
+}
+
+define void @move25(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move25:
+; CHECK-ALIGNED:       // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT:    ldur q0, [x1, #9]
+; CHECK-ALIGNED-NEXT:    ldr q1, [x1]
+; CHECK-ALIGNED-NEXT:    stur q0, [x0, #9]
+; CHECK-ALIGNED-NEXT:    str q1, [x0]
+; CHECK-ALIGNED-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 25, i1 false)
+  ret void
+}
+
+define void @move33(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move33:
+; CHECK-ALIGNED:       // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT:    ldp q1, q0, [x1]
+; CHECK-ALIGNED-NEXT:    ldrb w8, [x1, #32]
+; CHECK-ALIGNED-NEXT:    strb w8, [x0, #32]
+; CHECK-ALIGNED-NEXT:    stp q1, q0, [x0]
+; CHECK-ALIGNED-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 33, i1 false)
+  ret void
+}
+
+define void @move49(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move49:
+; CHECK-ALIGNED:       // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT:    ldp q2, q0, [x1, #16]
+; CHECK-ALIGNED-NEXT:    ldrb w8, [x1, #48]
+; CHECK-ALIGNED-NEXT:    ldr q1, [x1]
+; CHECK-ALIGNED-NEXT:    strb w8, [x0, #48]
+; CHECK-ALIGNED-NEXT:    stp q2, q0, [x0, #16]
+; CHECK-ALIGNED-NEXT:    str q1, [x0]
+; CHECK-ALIGNED-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 49, i1 false)
+  ret void
+}
+
+define void @move65(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move65:
+; CHECK-ALIGNED:       // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT:    ldp q0, q1, [x1, #32]
+; CHECK-ALIGNED-NEXT:    ldrb w8, [x1, #64]
+; CHECK-ALIGNED-NEXT:    ldp q2, q3, [x1]
+; CHECK-ALIGNED-NEXT:    strb w8, [x0, #64]
+; CHECK-ALIGNED-NEXT:    stp q0, q1, [x0, #32]
+; CHECK-ALIGNED-NEXT:    stp q2, q3, [x0]
+; CHECK-ALIGNED-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 65, i1 false)
+  ret void
+}

>From 902ba020f0b921f65d65a4a6751954b01869469a Mon Sep 17 00:00:00 2001
From: Osama Abdelkader <osama.abdelkader at gmail.com>
Date: Mon, 26 Jan 2026 00:25:10 +0100
Subject: [PATCH 2/2] Update test expectations for memmove overlapping
 optimization

Signed-off-by: Osama Abdelkader <osama.abdelkader at gmail.com>
---
 .../AMDGPU/memmove-param-combinations.ll      | 1617 ++++++-----------
 .../CodeGen/AMDGPU/memmove-scalar-load.ll     |   22 +-
 llvm/test/CodeGen/RISCV/memmove.ll            |  344 ++--
 ...ile-memstores-nooverlapping-load-stores.ll |    6 +-
 4 files changed, 830 insertions(+), 1159 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index 01b7f40f6256f..b59c55136a3ef 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -27,19 +27,16 @@ define void @memmove_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -83,19 +80,16 @@ define void @memmove_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -139,21 +133,13 @@ define void @memmove_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x4
-; CHECK-NEXT:    flat_load_dword v8, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    flat_store_dword v[0:1], v8 offset:16
-; CHECK-NEXT:    flat_load_dword v8, v[2:3] offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:20
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -197,22 +183,13 @@ define void @memmove_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p0_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_dword v[0:1], v9 offset:20
-; CHECK-NEXT:    flat_load_dword v2, v[2:3] offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
-; CHECK-NEXT:    flat_store_dword v[0:1], v2 offset:24
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -256,19 +233,16 @@ define void @memmove_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -312,19 +286,16 @@ define void @memmove_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -368,21 +339,13 @@ define void @memmove_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x4
-; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_dword v[0:1], v8 offset:16
-; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:20
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -426,22 +389,13 @@ define void @memmove_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p0_p1_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dword v[0:1], v9 offset:20
-; CHECK-NEXT:    global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dword v[0:1], v2 offset:24
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -485,19 +439,15 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT:    ds_read_b128 v[2:5], v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2
+; CHECK-NEXT:    ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -540,19 +490,15 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT:    ds_read_b128 v[2:5], v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2
+; CHECK-NEXT:    ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -595,19 +541,12 @@ define void @memmove_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b32 v[6:7], v2 offset0:4 offset1:5
-; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:15
+; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:15
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -650,19 +589,12 @@ define void @memmove_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p0_p3_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b32 v[6:7], v2 offset0:4 offset1:5
-; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT:    ds_read_b128 v[2:5], v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:15
+; CHECK-NEXT:    ds_read_b128 v[7:10], v2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:15
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -705,19 +637,16 @@ define void @memmove_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -761,19 +690,16 @@ define void @memmove_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -817,21 +743,13 @@ define void @memmove_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x4
-; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_dword v[0:1], v8 offset:16
-; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:20
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -875,22 +793,13 @@ define void @memmove_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p0_p4_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dword v[0:1], v9 offset:20
-; CHECK-NEXT:    global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dword v[0:1], v2 offset:24
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -938,22 +847,20 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[7:8] offset:23
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1007,22 +914,20 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[7:8] offset:23
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1076,23 +981,19 @@ define void @memmove_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1146,23 +1047,19 @@ define void @memmove_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p0_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1211,19 +1108,16 @@ define void @memmove_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1264,19 +1158,16 @@ define void @memmove_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1317,21 +1208,13 @@ define void @memmove_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x4
-; CHECK-NEXT:    flat_load_dword v8, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    global_store_dword v[0:1], v8, off offset:16
-; CHECK-NEXT:    flat_load_dword v8, v[2:3] offset:20
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    global_store_byte v[0:1], v11, off offset:30
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:20
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -1372,22 +1255,13 @@ define void @memmove_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p1_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT:    flat_load_dword v9, v[2:3] offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dword v[0:1], v9, off offset:20
-; CHECK-NEXT:    flat_load_dword v2, v[2:3] offset:24
-; CHECK-NEXT:    global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dword v[0:1], v2, off offset:24
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -1428,19 +1302,16 @@ define void @memmove_p1_p1_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1481,19 +1352,16 @@ define void @memmove_p1_p1_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1534,21 +1402,13 @@ define void @memmove_p1_p1_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p1_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x4
-; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    global_store_dword v[0:1], v8, off offset:16
-; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    global_store_byte v[0:1], v11, off offset:30
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:20
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -1589,22 +1449,13 @@ define void @memmove_p1_p1_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p1_p1_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dword v[0:1], v9, off offset:20
-; CHECK-NEXT:    global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT:    global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dword v[0:1], v2, off offset:24
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -1647,19 +1498,13 @@ define void @memmove_p1_p3_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ds_read_b64 v[7:8], v2
 ; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:8
-; CHECK-NEXT:    ds_read_b32 v9, v2 offset:24
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT:    ds_read_b64 v[9:10], v2 offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    global_store_dword v[0:1], v9, off offset:24
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_store_byte v[0:1], v2, off offset:30
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1701,19 +1546,13 @@ define void @memmove_p1_p3_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ds_read_b64 v[7:8], v2
 ; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:8
-; CHECK-NEXT:    ds_read_b32 v9, v2 offset:24
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT:    ds_read_b64 v[9:10], v2 offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    global_store_dword v[0:1], v9, off offset:24
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_store_byte v[0:1], v2, off offset:30
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1754,17 +1593,11 @@ define void @memmove_p1_p3_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset1:1
-; CHECK-NEXT:    ds_read_b32 v7, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:30
-; CHECK-NEXT:    ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
-; CHECK-NEXT:    ds_read_u16 v2, v2 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_store_short v[0:1], v2, off offset:28
-; CHECK-NEXT:    global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -1805,17 +1638,11 @@ define void @memmove_p1_p3_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ds_read_b128 v[3:6], v2
-; CHECK-NEXT:    ds_read_b32 v7, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:30
-; CHECK-NEXT:    ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
-; CHECK-NEXT:    ds_read_u16 v2, v2 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_store_short v[0:1], v2, off offset:28
-; CHECK-NEXT:    global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -1855,19 +1682,16 @@ define void @memmove_p1_p4_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1908,19 +1732,16 @@ define void @memmove_p1_p4_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1961,21 +1782,13 @@ define void @memmove_p1_p4_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p4_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x4
-; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    global_store_dword v[0:1], v8, off offset:16
-; CHECK-NEXT:    global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    global_store_byte v[0:1], v11, off offset:30
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:20
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -2016,22 +1829,13 @@ define void @memmove_p1_p4_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p1_p4_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT:    global_load_dword v9, v[2:3], off offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dword v[0:1], v9, off offset:20
-; CHECK-NEXT:    global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT:    global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dword v[0:1], v2, off offset:24
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -2076,24 +1880,21 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    global_store_dword v[0:1], v10, off offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -2144,24 +1945,21 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    global_store_dword v[0:1], v10, off offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -2212,22 +2010,19 @@ define void @memmove_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT:    global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -2278,22 +2073,19 @@ define void @memmove_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p1_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT:    global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -2341,19 +2133,15 @@ define void @memmove_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[7:8], v[1:2] offset:16
 ; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    ds_write_b8 v0, v8 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT:    ds_write_b32 v0, v7 offset:24
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(4)
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -2398,19 +2186,15 @@ define void @memmove_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[7:8], v[1:2] offset:16
 ; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    ds_write_b8 v0, v8 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT:    ds_write_b32 v0, v7 offset:24
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(4)
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -2455,20 +2239,13 @@ define void @memmove_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT:    flat_load_dwordx3 v[7:9], v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2] offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
 ; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT:    ds_write2_b32 v0, v7, v8 offset0:4 offset1:5
-; CHECK-NEXT:    ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b8 v0, v10 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b16 v0, v1 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2512,22 +2289,13 @@ define void @memmove_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p3_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_dword v8, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    ds_write_b32 v0, v8 offset:16
-; CHECK-NEXT:    flat_load_dword v8, v[1:2] offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_write_b32 v0, v8 offset:20
-; CHECK-NEXT:    flat_load_dword v1, v[1:2] offset:24
-; CHECK-NEXT:    ds_write_b8 v0, v7 offset:30
-; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
-; CHECK-NEXT:    ds_write_b32 v0, v1 offset:24
-; CHECK-NEXT:    ds_write_b128 v0, v[3:6]
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    ds_write_b128 v0, v[3:6] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write_b128 v0, v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2571,20 +2339,16 @@ define void @memmove_p3_p1_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    s_clause 0x2
 ; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT:    global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
-; CHECK-NEXT:    ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2628,20 +2392,16 @@ define void @memmove_p3_p1_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    s_clause 0x2
 ; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT:    global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
-; CHECK-NEXT:    ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2685,20 +2445,13 @@ define void @memmove_p3_p1_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p1_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT:    global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
-; CHECK-NEXT:    ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2742,20 +2495,13 @@ define void @memmove_p3_p1_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p3_p1_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    ds_write_b128 v0, v[3:6]
-; CHECK-NEXT:    ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT:    ds_write_b128 v0, v[3:6]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2799,20 +2545,14 @@ define void @memmove_p3_p3_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:28
-; CHECK-NEXT:    ds_read_b32 v9, v1 offset:24
-; CHECK-NEXT:    ds_read_b64 v[5:6], v1 offset:16
+; CHECK-NEXT:    ds_read_b64 v[5:6], v1 offset:23
+; CHECK-NEXT:    ds_read_b64 v[7:8], v1 offset:16
 ; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b8 v0, v7 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b16 v0, v8 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -2856,20 +2596,14 @@ define void @memmove_p3_p3_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:28
-; CHECK-NEXT:    ds_read_b32 v9, v1 offset:24
-; CHECK-NEXT:    ds_read_b64 v[5:6], v1 offset:16
+; CHECK-NEXT:    ds_read_b64 v[5:6], v1 offset:23
+; CHECK-NEXT:    ds_read_b64 v[7:8], v1 offset:16
 ; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b8 v0, v7 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b16 v0, v8 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -2914,20 +2648,11 @@ define void @memmove_p3_p3_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
-; CHECK-NEXT:    ds_read2_b32 v[6:7], v1 offset0:4 offset1:5
-; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v1, v1 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    ds_read_b128 v[6:9], v1 offset:15
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write2_b32 v0, v6, v7 offset0:4 offset1:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b32 v0, v8 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b8 v0, v9 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b16 v0, v1 offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    ds_write_b128 v0, v[6:9] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2970,21 +2695,12 @@ define void @memmove_p3_p3_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p3_p3_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b32 v[5:6], v1 offset0:4 offset1:5
-; CHECK-NEXT:    ds_read_b32 v7, v1 offset:24
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT:    ds_read_b128 v[1:4], v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write2_b32 v0, v5, v6 offset0:4 offset1:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b32 v0, v7 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b8 v0, v8 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    ds_write_b128 v0, v[1:4]
+; CHECK-NEXT:    ds_read_b128 v[2:5], v1 offset:15
+; CHECK-NEXT:    ds_read_b128 v[6:9], v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    ds_write_b128 v0, v[2:5] offset:15
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    ds_write_b128 v0, v[6:9]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3027,20 +2743,16 @@ define void @memmove_p3_p4_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    s_clause 0x2
 ; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT:    global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
-; CHECK-NEXT:    ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3084,20 +2796,16 @@ define void @memmove_p3_p4_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    s_clause 0x2
 ; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT:    global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
-; CHECK-NEXT:    ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3141,20 +2849,13 @@ define void @memmove_p3_p4_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p4_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT:    global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
-; CHECK-NEXT:    ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3198,20 +2899,13 @@ define void @memmove_p3_p4_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p3_p4_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    ds_write_b128 v0, v[3:6]
-; CHECK-NEXT:    ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT:    ds_write_b128 v0, v[3:6]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3259,25 +2953,21 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
-; CHECK-NEXT:    ds_write_b8 v0, v8 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    ds_write_b64 v0, v[6:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write_b64 v0, v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3331,25 +3021,21 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
-; CHECK-NEXT:    ds_write_b8 v0, v8 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
 ; CHECK-NEXT:    ds_write_b64 v0, v[6:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write_b64 v0, v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3403,26 +3089,19 @@ define void @memmove_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    ds_write2_b32 v0, v6, v7 offset0:5 offset1:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    ds_write_b32 v0, v8 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT:    ds_write_b128 v0, v[2:5] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3476,24 +3155,19 @@ define void @memmove_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p3_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
-; CHECK-NEXT:    ds_write_b32 v0, v7 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    ds_write_b16 v0, v10 offset:28
-; CHECK-NEXT:    ds_write_b8 v0, v6 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ds_write_b128 v0, v[2:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write_b128 v0, v[6:9] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3545,19 +3219,16 @@ define void @memmove_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[7:8], v[1:2] offset:16
 ; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3612,19 +3283,16 @@ define void @memmove_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[7:8], v[1:2] offset:16
 ; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3679,28 +3347,19 @@ define void @memmove_p5_p0_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x4
-; CHECK-NEXT:    flat_load_dword v7, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_dword v8, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    flat_load_dword v1, v[1:2] offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -3750,25 +3409,19 @@ define void @memmove_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p5_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_dword v8, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    flat_load_dword v8, v[1:2] offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    flat_load_dword v1, v[1:2] offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -3818,24 +3471,21 @@ define void @memmove_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3885,24 +3535,21 @@ define void @memmove_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -3952,24 +3599,19 @@ define void @memmove_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -4019,24 +3661,19 @@ define void @memmove_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p5_p1_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -4086,25 +3723,20 @@ define void @memmove_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
 ; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
 ; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    ds_read_b64 v[8:9], v1 offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4153,25 +3785,20 @@ define void @memmove_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
 ; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
 ; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    ds_read_b64 v[8:9], v1 offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4220,25 +3847,18 @@ define void @memmove_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b32 v[5:6], v1 offset0:4 offset1:5
-; CHECK-NEXT:    ds_read_b32 v7, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:30
-; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read_b128 v[6:9], v1 offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -4287,25 +3907,18 @@ define void @memmove_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p5_p3_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b32 v[5:6], v1 offset0:4 offset1:5
-; CHECK-NEXT:    ds_read_b32 v7, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:30
-; CHECK-NEXT:    ds_read_b128 v[1:4], v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    ds_read_b128 v[2:5], v1
+; CHECK-NEXT:    ds_read_b128 v[6:9], v1 offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -4354,24 +3967,21 @@ define void @memmove_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4421,24 +4031,21 @@ define void @memmove_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4488,24 +4095,19 @@ define void @memmove_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -4555,24 +4157,19 @@ define void @memmove_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p5_p4_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -4629,34 +4226,31 @@ define void @memmove_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4725,34 +4319,31 @@ define void @memmove_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4821,32 +4412,29 @@ define void @memmove_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:19
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -4917,32 +4505,29 @@ define void @memmove_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
 ; CHECK-LABEL: memmove_p5_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:19
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll
index 8fdecfac10927..0ded4604d4179 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll
@@ -27,22 +27,16 @@ define void @memmove_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[16:17], 0x0
-; CHECK-NEXT:    global_load_ubyte v9, v2, s[16:17] offset:30
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[16:17], 0x0
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v2, s[16:17] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v2, s4
-; CHECK-NEXT:    v_mov_b32_e32 v3, s5
-; CHECK-NEXT:    v_mov_b32_e32 v4, s6
-; CHECK-NEXT:    v_mov_b32_e32 v5, s7
-; CHECK-NEXT:    v_mov_b32_e32 v10, s11
-; CHECK-NEXT:    v_mov_b32_e32 v6, s8
-; CHECK-NEXT:    v_mov_b32_e32 v7, s9
-; CHECK-NEXT:    v_mov_b32_e32 v8, s10
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    v_mov_b32_e32 v9, s7
+; CHECK-NEXT:    v_mov_b32_e32 v8, s6
+; CHECK-NEXT:    v_mov_b32_e32 v7, s5
+; CHECK-NEXT:    v_mov_b32_e32 v6, s4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 4 %dst, ptr addrspace(4) noundef nonnull align 4 %src, i64 31, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/memmove.ll b/llvm/test/CodeGen/RISCV/memmove.ll
index 1fffe359389b0..89c12c4fa9612 100644
--- a/llvm/test/CodeGen/RISCV/memmove.ll
+++ b/llvm/test/CodeGen/RISCV/memmove.ll
@@ -195,22 +195,18 @@ define void @unaligned_memmove7(ptr nocapture %dest, ptr %src) nounwind {
 ;
 ; RV32-FAST-LABEL: unaligned_memmove7:
 ; RV32-FAST:       # %bb.0: # %entry
-; RV32-FAST-NEXT:    lw a2, 0(a1)
-; RV32-FAST-NEXT:    lh a3, 4(a1)
-; RV32-FAST-NEXT:    lbu a1, 6(a1)
-; RV32-FAST-NEXT:    sw a2, 0(a0)
-; RV32-FAST-NEXT:    sh a3, 4(a0)
-; RV32-FAST-NEXT:    sb a1, 6(a0)
+; RV32-FAST-NEXT:    lw a2, 3(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 3(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: unaligned_memmove7:
 ; RV64-FAST:       # %bb.0: # %entry
-; RV64-FAST-NEXT:    lw a2, 0(a1)
-; RV64-FAST-NEXT:    lh a3, 4(a1)
-; RV64-FAST-NEXT:    lbu a1, 6(a1)
-; RV64-FAST-NEXT:    sw a2, 0(a0)
-; RV64-FAST-NEXT:    sh a3, 4(a0)
-; RV64-FAST-NEXT:    sb a1, 6(a0)
+; RV64-FAST-NEXT:    lw a2, 3(a1)
+; RV64-FAST-NEXT:    lw a1, 0(a1)
+; RV64-FAST-NEXT:    sw a2, 3(a0)
+; RV64-FAST-NEXT:    sw a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
   tail call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 7, i1 false)
@@ -289,28 +285,22 @@ define void @unaligned_memmove15(ptr nocapture %dest, ptr %src) nounwind {
 ;
 ; RV32-FAST-LABEL: unaligned_memmove15:
 ; RV32-FAST:       # %bb.0: # %entry
-; RV32-FAST-NEXT:    lbu a2, 14(a1)
+; RV32-FAST-NEXT:    lw a2, 11(a1)
 ; RV32-FAST-NEXT:    lw a3, 0(a1)
 ; RV32-FAST-NEXT:    lw a4, 4(a1)
-; RV32-FAST-NEXT:    lw a5, 8(a1)
-; RV32-FAST-NEXT:    lh a1, 12(a1)
-; RV32-FAST-NEXT:    sb a2, 14(a0)
+; RV32-FAST-NEXT:    lw a1, 8(a1)
+; RV32-FAST-NEXT:    sw a2, 11(a0)
 ; RV32-FAST-NEXT:    sw a3, 0(a0)
 ; RV32-FAST-NEXT:    sw a4, 4(a0)
-; RV32-FAST-NEXT:    sw a5, 8(a0)
-; RV32-FAST-NEXT:    sh a1, 12(a0)
+; RV32-FAST-NEXT:    sw a1, 8(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: unaligned_memmove15:
 ; RV64-FAST:       # %bb.0: # %entry
-; RV64-FAST-NEXT:    ld a2, 0(a1)
-; RV64-FAST-NEXT:    lw a3, 8(a1)
-; RV64-FAST-NEXT:    lh a4, 12(a1)
-; RV64-FAST-NEXT:    lbu a1, 14(a1)
-; RV64-FAST-NEXT:    sd a2, 0(a0)
-; RV64-FAST-NEXT:    sw a3, 8(a0)
-; RV64-FAST-NEXT:    sh a4, 12(a0)
-; RV64-FAST-NEXT:    sb a1, 14(a0)
+; RV64-FAST-NEXT:    ld a2, 7(a1)
+; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sd a2, 7(a0)
+; RV64-FAST-NEXT:    sd a1, 0(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
   tail call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false)
@@ -353,30 +343,46 @@ entry:
 }
 
 define void @unaligned_memmove31(ptr nocapture %dest, ptr %src) nounwind {
-; RV32-BOTH-LABEL: unaligned_memmove31:
-; RV32-BOTH:       # %bb.0: # %entry
-; RV32-BOTH-NEXT:    li a2, 31
-; RV32-BOTH-NEXT:    tail memmove
+; RV32-LABEL: unaligned_memmove31:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a2, 31
+; RV32-NEXT:    tail memmove
 ;
 ; RV64-LABEL: unaligned_memmove31:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    li a2, 31
 ; RV64-NEXT:    tail memmove
 ;
+; RV32-FAST-LABEL: unaligned_memmove31:
+; RV32-FAST:       # %bb.0: # %entry
+; RV32-FAST-NEXT:    lw a2, 16(a1)
+; RV32-FAST-NEXT:    lw a3, 20(a1)
+; RV32-FAST-NEXT:    lw a4, 24(a1)
+; RV32-FAST-NEXT:    lw a5, 27(a1)
+; RV32-FAST-NEXT:    lw a6, 0(a1)
+; RV32-FAST-NEXT:    lw a7, 4(a1)
+; RV32-FAST-NEXT:    lw t0, 8(a1)
+; RV32-FAST-NEXT:    lw a1, 12(a1)
+; RV32-FAST-NEXT:    sw a5, 27(a0)
+; RV32-FAST-NEXT:    sw a2, 16(a0)
+; RV32-FAST-NEXT:    sw a3, 20(a0)
+; RV32-FAST-NEXT:    sw a4, 24(a0)
+; RV32-FAST-NEXT:    sw a6, 0(a0)
+; RV32-FAST-NEXT:    sw a7, 4(a0)
+; RV32-FAST-NEXT:    sw t0, 8(a0)
+; RV32-FAST-NEXT:    sw a1, 12(a0)
+; RV32-FAST-NEXT:    ret
+;
 ; RV64-FAST-LABEL: unaligned_memmove31:
 ; RV64-FAST:       # %bb.0: # %entry
-; RV64-FAST-NEXT:    lh a2, 28(a1)
-; RV64-FAST-NEXT:    lbu a3, 30(a1)
-; RV64-FAST-NEXT:    ld a4, 0(a1)
-; RV64-FAST-NEXT:    ld a5, 8(a1)
-; RV64-FAST-NEXT:    ld a6, 16(a1)
-; RV64-FAST-NEXT:    lw a1, 24(a1)
-; RV64-FAST-NEXT:    sh a2, 28(a0)
-; RV64-FAST-NEXT:    sb a3, 30(a0)
-; RV64-FAST-NEXT:    sd a4, 0(a0)
-; RV64-FAST-NEXT:    sd a5, 8(a0)
-; RV64-FAST-NEXT:    sd a6, 16(a0)
-; RV64-FAST-NEXT:    sw a1, 24(a0)
+; RV64-FAST-NEXT:    ld a2, 23(a1)
+; RV64-FAST-NEXT:    ld a3, 0(a1)
+; RV64-FAST-NEXT:    ld a4, 8(a1)
+; RV64-FAST-NEXT:    ld a1, 16(a1)
+; RV64-FAST-NEXT:    sd a2, 23(a0)
+; RV64-FAST-NEXT:    sd a3, 0(a0)
+; RV64-FAST-NEXT:    sd a4, 8(a0)
+; RV64-FAST-NEXT:    sd a1, 16(a0)
 ; RV64-FAST-NEXT:    ret
 entry:
   tail call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false)
@@ -472,25 +478,41 @@ entry:
 }
 
 define void @aligned_memmove7(ptr nocapture %dest, ptr %src) nounwind {
-; RV32-BOTH-LABEL: aligned_memmove7:
-; RV32-BOTH:       # %bb.0: # %entry
-; RV32-BOTH-NEXT:    lw a2, 0(a1)
-; RV32-BOTH-NEXT:    lh a3, 4(a1)
-; RV32-BOTH-NEXT:    lbu a1, 6(a1)
-; RV32-BOTH-NEXT:    sw a2, 0(a0)
-; RV32-BOTH-NEXT:    sh a3, 4(a0)
-; RV32-BOTH-NEXT:    sb a1, 6(a0)
-; RV32-BOTH-NEXT:    ret
+; RV32-LABEL: aligned_memmove7:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    lw a2, 0(a1)
+; RV32-NEXT:    lh a3, 4(a1)
+; RV32-NEXT:    lbu a1, 6(a1)
+; RV32-NEXT:    sw a2, 0(a0)
+; RV32-NEXT:    sh a3, 4(a0)
+; RV32-NEXT:    sb a1, 6(a0)
+; RV32-NEXT:    ret
 ;
-; RV64-BOTH-LABEL: aligned_memmove7:
-; RV64-BOTH:       # %bb.0: # %entry
-; RV64-BOTH-NEXT:    lw a2, 0(a1)
-; RV64-BOTH-NEXT:    lh a3, 4(a1)
-; RV64-BOTH-NEXT:    lbu a1, 6(a1)
-; RV64-BOTH-NEXT:    sw a2, 0(a0)
-; RV64-BOTH-NEXT:    sh a3, 4(a0)
-; RV64-BOTH-NEXT:    sb a1, 6(a0)
-; RV64-BOTH-NEXT:    ret
+; RV64-LABEL: aligned_memmove7:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    lw a2, 0(a1)
+; RV64-NEXT:    lh a3, 4(a1)
+; RV64-NEXT:    lbu a1, 6(a1)
+; RV64-NEXT:    sw a2, 0(a0)
+; RV64-NEXT:    sh a3, 4(a0)
+; RV64-NEXT:    sb a1, 6(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: aligned_memmove7:
+; RV32-FAST:       # %bb.0: # %entry
+; RV32-FAST-NEXT:    lw a2, 3(a1)
+; RV32-FAST-NEXT:    lw a1, 0(a1)
+; RV32-FAST-NEXT:    sw a2, 3(a0)
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: aligned_memmove7:
+; RV64-FAST:       # %bb.0: # %entry
+; RV64-FAST-NEXT:    lw a2, 3(a1)
+; RV64-FAST-NEXT:    lw a1, 0(a1)
+; RV64-FAST-NEXT:    sw a2, 3(a0)
+; RV64-FAST-NEXT:    sw a1, 0(a0)
+; RV64-FAST-NEXT:    ret
 entry:
   tail call void @llvm.memmove.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 7, i1 false)
   ret void
@@ -516,31 +538,51 @@ entry:
 }
 
 define void @aligned_memmove15(ptr nocapture %dest, ptr %src) nounwind {
-; RV32-BOTH-LABEL: aligned_memmove15:
-; RV32-BOTH:       # %bb.0: # %entry
-; RV32-BOTH-NEXT:    lw a2, 0(a1)
-; RV32-BOTH-NEXT:    lw a3, 8(a1)
-; RV32-BOTH-NEXT:    lh a4, 12(a1)
-; RV32-BOTH-NEXT:    lbu a5, 14(a1)
-; RV32-BOTH-NEXT:    sw a2, 0(a0)
-; RV32-BOTH-NEXT:    lw a1, 4(a1)
-; RV32-BOTH-NEXT:    sw a1, 4(a0)
-; RV32-BOTH-NEXT:    sw a3, 8(a0)
-; RV32-BOTH-NEXT:    sh a4, 12(a0)
-; RV32-BOTH-NEXT:    sb a5, 14(a0)
-; RV32-BOTH-NEXT:    ret
+; RV32-LABEL: aligned_memmove15:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    lw a2, 0(a1)
+; RV32-NEXT:    lw a3, 8(a1)
+; RV32-NEXT:    lh a4, 12(a1)
+; RV32-NEXT:    lbu a5, 14(a1)
+; RV32-NEXT:    sw a2, 0(a0)
+; RV32-NEXT:    lw a1, 4(a1)
+; RV32-NEXT:    sw a1, 4(a0)
+; RV32-NEXT:    sw a3, 8(a0)
+; RV32-NEXT:    sh a4, 12(a0)
+; RV32-NEXT:    sb a5, 14(a0)
+; RV32-NEXT:    ret
 ;
-; RV64-BOTH-LABEL: aligned_memmove15:
-; RV64-BOTH:       # %bb.0: # %entry
-; RV64-BOTH-NEXT:    ld a2, 0(a1)
-; RV64-BOTH-NEXT:    lw a3, 8(a1)
-; RV64-BOTH-NEXT:    lh a4, 12(a1)
-; RV64-BOTH-NEXT:    lbu a1, 14(a1)
-; RV64-BOTH-NEXT:    sd a2, 0(a0)
-; RV64-BOTH-NEXT:    sw a3, 8(a0)
-; RV64-BOTH-NEXT:    sh a4, 12(a0)
-; RV64-BOTH-NEXT:    sb a1, 14(a0)
-; RV64-BOTH-NEXT:    ret
+; RV64-LABEL: aligned_memmove15:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    ld a2, 0(a1)
+; RV64-NEXT:    lw a3, 8(a1)
+; RV64-NEXT:    lh a4, 12(a1)
+; RV64-NEXT:    lbu a1, 14(a1)
+; RV64-NEXT:    sd a2, 0(a0)
+; RV64-NEXT:    sw a3, 8(a0)
+; RV64-NEXT:    sh a4, 12(a0)
+; RV64-NEXT:    sb a1, 14(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: aligned_memmove15:
+; RV32-FAST:       # %bb.0: # %entry
+; RV32-FAST-NEXT:    lw a2, 0(a1)
+; RV32-FAST-NEXT:    lw a3, 8(a1)
+; RV32-FAST-NEXT:    lw a4, 11(a1)
+; RV32-FAST-NEXT:    sw a2, 0(a0)
+; RV32-FAST-NEXT:    lw a1, 4(a1)
+; RV32-FAST-NEXT:    sw a1, 4(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a4, 11(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: aligned_memmove15:
+; RV64-FAST:       # %bb.0: # %entry
+; RV64-FAST-NEXT:    ld a2, 7(a1)
+; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sd a2, 7(a0)
+; RV64-FAST-NEXT:    sd a1, 0(a0)
+; RV64-FAST-NEXT:    ret
 entry:
   tail call void @llvm.memmove.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false)
   ret void
@@ -572,26 +614,58 @@ entry:
 }
 
 define void @aligned_memmove31(ptr nocapture %dest, ptr %src) nounwind {
-; RV32-BOTH-LABEL: aligned_memmove31:
-; RV32-BOTH:       # %bb.0: # %entry
-; RV32-BOTH-NEXT:    li a2, 31
-; RV32-BOTH-NEXT:    tail memmove
+; RV32-LABEL: aligned_memmove31:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    li a2, 31
+; RV32-NEXT:    tail memmove
 ;
-; RV64-BOTH-LABEL: aligned_memmove31:
-; RV64-BOTH:       # %bb.0: # %entry
-; RV64-BOTH-NEXT:    lh a2, 28(a1)
-; RV64-BOTH-NEXT:    lbu a3, 30(a1)
-; RV64-BOTH-NEXT:    ld a4, 0(a1)
-; RV64-BOTH-NEXT:    ld a5, 8(a1)
-; RV64-BOTH-NEXT:    ld a6, 16(a1)
-; RV64-BOTH-NEXT:    lw a1, 24(a1)
-; RV64-BOTH-NEXT:    sh a2, 28(a0)
-; RV64-BOTH-NEXT:    sb a3, 30(a0)
-; RV64-BOTH-NEXT:    sd a4, 0(a0)
-; RV64-BOTH-NEXT:    sd a5, 8(a0)
-; RV64-BOTH-NEXT:    sd a6, 16(a0)
-; RV64-BOTH-NEXT:    sw a1, 24(a0)
-; RV64-BOTH-NEXT:    ret
+; RV64-LABEL: aligned_memmove31:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    lh a2, 28(a1)
+; RV64-NEXT:    lbu a3, 30(a1)
+; RV64-NEXT:    ld a4, 0(a1)
+; RV64-NEXT:    ld a5, 8(a1)
+; RV64-NEXT:    ld a6, 16(a1)
+; RV64-NEXT:    lw a1, 24(a1)
+; RV64-NEXT:    sh a2, 28(a0)
+; RV64-NEXT:    sb a3, 30(a0)
+; RV64-NEXT:    sd a4, 0(a0)
+; RV64-NEXT:    sd a5, 8(a0)
+; RV64-NEXT:    sd a6, 16(a0)
+; RV64-NEXT:    sw a1, 24(a0)
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: aligned_memmove31:
+; RV32-FAST:       # %bb.0: # %entry
+; RV32-FAST-NEXT:    lw a2, 27(a1)
+; RV32-FAST-NEXT:    lw a3, 0(a1)
+; RV32-FAST-NEXT:    lw a4, 8(a1)
+; RV32-FAST-NEXT:    lw a5, 16(a1)
+; RV32-FAST-NEXT:    lw a6, 24(a1)
+; RV32-FAST-NEXT:    sw a3, 0(a0)
+; RV32-FAST-NEXT:    lw a3, 4(a1)
+; RV32-FAST-NEXT:    lw a7, 12(a1)
+; RV32-FAST-NEXT:    lw a1, 20(a1)
+; RV32-FAST-NEXT:    sw a3, 4(a0)
+; RV32-FAST-NEXT:    sw a4, 8(a0)
+; RV32-FAST-NEXT:    sw a7, 12(a0)
+; RV32-FAST-NEXT:    sw a5, 16(a0)
+; RV32-FAST-NEXT:    sw a1, 20(a0)
+; RV32-FAST-NEXT:    sw a6, 24(a0)
+; RV32-FAST-NEXT:    sw a2, 27(a0)
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: aligned_memmove31:
+; RV64-FAST:       # %bb.0: # %entry
+; RV64-FAST-NEXT:    ld a2, 23(a1)
+; RV64-FAST-NEXT:    ld a3, 0(a1)
+; RV64-FAST-NEXT:    ld a4, 8(a1)
+; RV64-FAST-NEXT:    ld a1, 16(a1)
+; RV64-FAST-NEXT:    sd a2, 23(a0)
+; RV64-FAST-NEXT:    sd a3, 0(a0)
+; RV64-FAST-NEXT:    sd a4, 8(a0)
+; RV64-FAST-NEXT:    sd a1, 16(a0)
+; RV64-FAST-NEXT:    ret
 entry:
   tail call void @llvm.memmove.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false)
   ret void
@@ -638,29 +712,49 @@ entry:
 }
 
 define i32 @memmove11_align8(ptr nocapture %dest, ptr %src) {
-; RV32-BOTH-LABEL: memmove11_align8:
-; RV32-BOTH:       # %bb.0: # %entry
-; RV32-BOTH-NEXT:    lw a2, 0(a1)
-; RV32-BOTH-NEXT:    lh a3, 8(a1)
-; RV32-BOTH-NEXT:    lbu a4, 10(a1)
-; RV32-BOTH-NEXT:    sw a2, 0(a0)
-; RV32-BOTH-NEXT:    lw a1, 4(a1)
-; RV32-BOTH-NEXT:    sw a1, 4(a0)
-; RV32-BOTH-NEXT:    sh a3, 8(a0)
-; RV32-BOTH-NEXT:    sb a4, 10(a0)
-; RV32-BOTH-NEXT:    li a0, 0
-; RV32-BOTH-NEXT:    ret
+; RV32-LABEL: memmove11_align8:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    lw a2, 0(a1)
+; RV32-NEXT:    lh a3, 8(a1)
+; RV32-NEXT:    lbu a4, 10(a1)
+; RV32-NEXT:    sw a2, 0(a0)
+; RV32-NEXT:    lw a1, 4(a1)
+; RV32-NEXT:    sw a1, 4(a0)
+; RV32-NEXT:    sh a3, 8(a0)
+; RV32-NEXT:    sb a4, 10(a0)
+; RV32-NEXT:    li a0, 0
+; RV32-NEXT:    ret
 ;
-; RV64-BOTH-LABEL: memmove11_align8:
-; RV64-BOTH:       # %bb.0: # %entry
-; RV64-BOTH-NEXT:    ld a2, 0(a1)
-; RV64-BOTH-NEXT:    lh a3, 8(a1)
-; RV64-BOTH-NEXT:    lbu a1, 10(a1)
-; RV64-BOTH-NEXT:    sd a2, 0(a0)
-; RV64-BOTH-NEXT:    sh a3, 8(a0)
-; RV64-BOTH-NEXT:    sb a1, 10(a0)
-; RV64-BOTH-NEXT:    li a0, 0
-; RV64-BOTH-NEXT:    ret
+; RV64-LABEL: memmove11_align8:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    ld a2, 0(a1)
+; RV64-NEXT:    lh a3, 8(a1)
+; RV64-NEXT:    lbu a1, 10(a1)
+; RV64-NEXT:    sd a2, 0(a0)
+; RV64-NEXT:    sh a3, 8(a0)
+; RV64-NEXT:    sb a1, 10(a0)
+; RV64-NEXT:    li a0, 0
+; RV64-NEXT:    ret
+;
+; RV32-FAST-LABEL: memmove11_align8:
+; RV32-FAST:       # %bb.0: # %entry
+; RV32-FAST-NEXT:    lw a2, 0(a1)
+; RV32-FAST-NEXT:    lw a3, 7(a1)
+; RV32-FAST-NEXT:    sw a2, 0(a0)
+; RV32-FAST-NEXT:    lw a1, 4(a1)
+; RV32-FAST-NEXT:    sw a1, 4(a0)
+; RV32-FAST-NEXT:    sw a3, 7(a0)
+; RV32-FAST-NEXT:    li a0, 0
+; RV32-FAST-NEXT:    ret
+;
+; RV64-FAST-LABEL: memmove11_align8:
+; RV64-FAST:       # %bb.0: # %entry
+; RV64-FAST-NEXT:    lw a2, 7(a1)
+; RV64-FAST-NEXT:    ld a1, 0(a1)
+; RV64-FAST-NEXT:    sw a2, 7(a0)
+; RV64-FAST-NEXT:    sd a1, 0(a0)
+; RV64-FAST-NEXT:    li a0, 0
+; RV64-FAST-NEXT:    ret
 entry:
   call void @llvm.memmove.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 11, i1 false)
   ret i32 0
diff --git a/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll b/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll
index dd61ec629c2f0..bb1a9500a4c94 100644
--- a/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll
+++ b/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll
@@ -34,10 +34,8 @@ define dso_local void @move_7_bytes(ptr nocapture, ptr nocapture readonly) nounw
 ; CHECK-LABEL: move_7_bytes:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl (%rsi), %eax
-; CHECK-NEXT:    movzwl 4(%rsi), %ecx
-; CHECK-NEXT:    movzbl 6(%rsi), %edx
-; CHECK-NEXT:    movb %dl, 6(%rdi)
-; CHECK-NEXT:    movw %cx, 4(%rdi)
+; CHECK-NEXT:    movl 3(%rsi), %ecx
+; CHECK-NEXT:    movl %ecx, 3(%rdi)
 ; CHECK-NEXT:    movl %eax, (%rdi)
 ; CHECK-NEXT:    retq
   tail call void @llvm.memmove.p0.p0.i64(ptr align 1 %0, ptr align 1 %1, i64 7, i1 false)



More information about the llvm-commits mailing list