[llvm] Enable generic overlapping optimization for memmove (PR #177885)
Osama Abdelkader via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 25 16:10:07 PST 2026
https://github.com/osamakader updated https://github.com/llvm/llvm-project/pull/177885
>From 2635442658975051a1332bc2a3e5911c7f27029c Mon Sep 17 00:00:00 2001
From: Osama Abdelkader <osama.abdelkader at gmail.com>
Date: Sun, 25 Jan 2026 22:40:15 +0100
Subject: [PATCH 1/2] Enable generic overlapping optimization for memmove
This change enables memmove to use the same generic overlapping load/store
optimization that memcpy uses, instead of requiring target-specific code.
Changes:
1. Pass isVol instead of hardcoding IsVolatile=true in getMemmoveLoadsAndStores.
This lets allowOverlap() return true for non-volatile memmove, so
findOptimalMemOpLowering can generate overlapping MemOps.
2. Add overlapping load/store handling to memmove, matching memcpy's
implementation. When the last MemOp is larger than the remaining size, its
offset is moved back so that it overlaps the previous load/store; this
covers non-power-of-two sizes.
Fixes: #165948
Signed-off-by: Osama Abdelkader <osama.abdelkader at gmail.com>
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 16 ++-
llvm/test/CodeGen/AArch64/memmove-inline.ll | 98 +++++++++++++++++++
2 files changed, 112 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4ca1bb053fce5..cf7eb82ba5ed0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8966,8 +8966,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
if (!TLI.findOptimalMemOpLowering(
C, MemOps, Limit,
- MemOp::Copy(Size, DstAlignCanChange, Alignment, *SrcAlign,
- /*IsVolatile*/ true),
+ MemOp::Copy(Size, DstAlignCanChange, Alignment, *SrcAlign, isVol),
DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
MF.getFunction().getAttributes()))
return SDValue();
@@ -9008,6 +9007,12 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Value;
+ if (i == NumMemOps - 1 && i != 0 && VTSize > Size - SrcOff) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ SrcOff -= VTSize - (Size - SrcOff);
+ }
+
bool isDereferenceable =
SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
@@ -9024,11 +9029,18 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
}
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
OutChains.clear();
+ DstOff = 0;
for (unsigned i = 0; i < NumMemOps; i++) {
EVT VT = MemOps[i];
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Store;
+ if (i == NumMemOps - 1 && i != 0 && VTSize > Size - DstOff) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ DstOff -= VTSize - (Size - DstOff);
+ }
+
Store = DAG.getStore(
Chain, dl, LoadValues[i],
DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
diff --git a/llvm/test/CodeGen/AArch64/memmove-inline.ll b/llvm/test/CodeGen/AArch64/memmove-inline.ll
index 641c48dd0f1c5..4ca180616442d 100644
--- a/llvm/test/CodeGen/AArch64/memmove-inline.ll
+++ b/llvm/test/CodeGen/AArch64/memmove-inline.ll
@@ -120,3 +120,101 @@ entry:
}
declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)
+
+; Test overlapping memmove optimization for non-power-of-two sizes
+; These should use overlapping loads/stores instead of mixed-size operations
+
+define void @move7(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move7:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur w8, [x1, #3]
+; CHECK-ALIGNED-NEXT: ldr w9, [x1]
+; CHECK-ALIGNED-NEXT: stur w8, [x0, #3]
+; CHECK-ALIGNED-NEXT: str w9, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 7, i1 false)
+ ret void
+}
+
+define void @move13(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move13:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur x8, [x1, #5]
+; CHECK-ALIGNED-NEXT: ldr x9, [x1]
+; CHECK-ALIGNED-NEXT: stur x8, [x0, #5]
+; CHECK-ALIGNED-NEXT: str x9, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 13, i1 false)
+ ret void
+}
+
+define void @move15(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move15:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur x8, [x1, #7]
+; CHECK-ALIGNED-NEXT: ldr x9, [x1]
+; CHECK-ALIGNED-NEXT: stur x8, [x0, #7]
+; CHECK-ALIGNED-NEXT: str x9, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 15, i1 false)
+ ret void
+}
+
+define void @move25(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move25:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur q0, [x1, #9]
+; CHECK-ALIGNED-NEXT: ldr q1, [x1]
+; CHECK-ALIGNED-NEXT: stur q0, [x0, #9]
+; CHECK-ALIGNED-NEXT: str q1, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 25, i1 false)
+ ret void
+}
+
+define void @move33(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move33:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldp q1, q0, [x1]
+; CHECK-ALIGNED-NEXT: ldrb w8, [x1, #32]
+; CHECK-ALIGNED-NEXT: strb w8, [x0, #32]
+; CHECK-ALIGNED-NEXT: stp q1, q0, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 33, i1 false)
+ ret void
+}
+
+define void @move49(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move49:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldp q2, q0, [x1, #16]
+; CHECK-ALIGNED-NEXT: ldrb w8, [x1, #48]
+; CHECK-ALIGNED-NEXT: ldr q1, [x1]
+; CHECK-ALIGNED-NEXT: strb w8, [x0, #48]
+; CHECK-ALIGNED-NEXT: stp q2, q0, [x0, #16]
+; CHECK-ALIGNED-NEXT: str q1, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 49, i1 false)
+ ret void
+}
+
+define void @move65(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move65:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldp q0, q1, [x1, #32]
+; CHECK-ALIGNED-NEXT: ldrb w8, [x1, #64]
+; CHECK-ALIGNED-NEXT: ldp q2, q3, [x1]
+; CHECK-ALIGNED-NEXT: strb w8, [x0, #64]
+; CHECK-ALIGNED-NEXT: stp q0, q1, [x0, #32]
+; CHECK-ALIGNED-NEXT: stp q2, q3, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 65, i1 false)
+ ret void
+}
>From 902ba020f0b921f65d65a4a6751954b01869469a Mon Sep 17 00:00:00 2001
From: Osama Abdelkader <osama.abdelkader at gmail.com>
Date: Mon, 26 Jan 2026 00:25:10 +0100
Subject: [PATCH 2/2] Update test expectations for memmove overlapping
optimization
Signed-off-by: Osama Abdelkader <osama.abdelkader at gmail.com>
---
.../AMDGPU/memmove-param-combinations.ll | 1617 ++++++-----------
.../CodeGen/AMDGPU/memmove-scalar-load.ll | 22 +-
llvm/test/CodeGen/RISCV/memmove.ll | 344 ++--
...ile-memstores-nooverlapping-load-stores.ll | 6 +-
4 files changed, 830 insertions(+), 1159 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index 01b7f40f6256f..b59c55136a3ef 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -27,19 +27,16 @@ define void @memmove_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p0_p0_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -83,19 +80,16 @@ define void @memmove_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p0_p0_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -139,21 +133,13 @@ define void @memmove_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p0_p0_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:24
-; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
-; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:20
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -197,22 +183,13 @@ define void @memmove_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
; CHECK-LABEL: memmove_p0_p0_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:16
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
-; CHECK-NEXT: flat_load_dword v2, v[2:3] offset:24
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
-; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -256,19 +233,16 @@ define void @memmove_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p0_p1_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -312,19 +286,16 @@ define void @memmove_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p0_p1_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -368,21 +339,13 @@ define void @memmove_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p0_p1_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
-; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:20
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -426,22 +389,13 @@ define void @memmove_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
; CHECK-LABEL: memmove_p0_p1_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
-; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -485,19 +439,15 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read_b128 v[2:5], v2
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -540,19 +490,15 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read_b128 v[2:5], v2
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -595,19 +541,12 @@ define void @memmove_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b32 v[6:7], v2 offset0:4 offset1:5
-; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15
+; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -650,19 +589,12 @@ define void @memmove_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
; CHECK-LABEL: memmove_p0_p3_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b32 v[6:7], v2 offset0:4 offset1:5
-; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read_b128 v[2:5], v2
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -705,19 +637,16 @@ define void @memmove_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p0_p4_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -761,19 +690,16 @@ define void @memmove_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p0_p4_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -817,21 +743,13 @@ define void @memmove_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p0_p4_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
-; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:20
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -875,22 +793,13 @@ define void @memmove_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
; CHECK-LABEL: memmove_p0_p4_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
-; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -938,22 +847,20 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p0_p5_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -1007,22 +914,20 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p0_p5_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -1076,23 +981,19 @@ define void @memmove_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p0_p5_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -1146,23 +1047,19 @@ define void @memmove_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr
; CHECK-LABEL: memmove_p0_p5_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -1211,19 +1108,16 @@ define void @memmove_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p1_p0_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1264,19 +1158,16 @@ define void @memmove_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p1_p0_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1317,21 +1208,13 @@ define void @memmove_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p1_p0_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:24
-; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
-; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:20
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:20
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -1372,22 +1255,13 @@ define void @memmove_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK-LABEL: memmove_p1_p0_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:16
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
-; CHECK-NEXT: flat_load_dword v2, v[2:3] offset:24
-; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -1428,19 +1302,16 @@ define void @memmove_p1_p1_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p1_p1_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1481,19 +1352,16 @@ define void @memmove_p1_p1_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p1_p1_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1534,21 +1402,13 @@ define void @memmove_p1_p1_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p1_p1_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
-; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:20
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -1589,22 +1449,13 @@ define void @memmove_p1_p1_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK-LABEL: memmove_p1_p1_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
-; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -1647,19 +1498,13 @@ define void @memmove_p1_p3_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read_b64 v[7:8], v2
; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:8
-; CHECK-NEXT: ds_read_b32 v9, v2 offset:24
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read_u8 v2, v2 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:24
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_byte v[0:1], v2, off offset:30
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1701,19 +1546,13 @@ define void @memmove_p1_p3_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read_b64 v[7:8], v2
; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:8
-; CHECK-NEXT: ds_read_b32 v9, v2 offset:24
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read_u8 v2, v2 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:24
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_byte v[0:1], v2, off offset:30
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1754,17 +1593,11 @@ define void @memmove_p1_p3_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1
-; CHECK-NEXT: ds_read_b32 v7, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:30
-; CHECK-NEXT: ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
-; CHECK-NEXT: ds_read_u16 v2, v2 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_short v[0:1], v2, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -1805,17 +1638,11 @@ define void @memmove_p1_p3_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read_b128 v[3:6], v2
-; CHECK-NEXT: ds_read_b32 v7, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:30
-; CHECK-NEXT: ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
-; CHECK-NEXT: ds_read_u16 v2, v2 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_short v[0:1], v2, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -1855,19 +1682,16 @@ define void @memmove_p1_p4_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p1_p4_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1908,19 +1732,16 @@ define void @memmove_p1_p4_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p1_p4_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1961,21 +1782,13 @@ define void @memmove_p1_p4_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p1_p4_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
-; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
-; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:20
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -2016,22 +1829,13 @@ define void @memmove_p1_p4_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK-LABEL: memmove_p1_p4_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
-; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
-; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
-; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -2076,24 +1880,21 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p1_p5_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(6)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -2144,24 +1945,21 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p1_p5_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(6)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -2212,22 +2010,19 @@ define void @memmove_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p1_p5_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -2278,22 +2073,19 @@ define void @memmove_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr
; CHECK-LABEL: memmove_p1_p5_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(4)
; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -2341,19 +2133,15 @@ define void @memmove_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p3_p0_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: ds_write_b8 v0, v8 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT: ds_write_b16 v0, v9 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT: ds_write_b32 v0, v7 offset:24
-; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4)
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -2398,19 +2186,15 @@ define void @memmove_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p3_p0_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: ds_write_b8 v0, v8 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT: ds_write_b16 v0, v9 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT: ds_write_b32 v0, v7 offset:24
-; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4)
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -2455,20 +2239,13 @@ define void @memmove_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p3_p0_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT: flat_load_dwordx3 v[7:9], v[1:2] offset:16
-; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v1, v[1:2] offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT: ds_write2_b32 v0, v7, v8 offset0:4 offset1:5
-; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(4)
-; CHECK-NEXT: ds_write_b8 v0, v10 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4)
-; CHECK-NEXT: ds_write_b16 v0, v1 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -2512,22 +2289,13 @@ define void @memmove_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
; CHECK-LABEL: memmove_p3_p0_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:16
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: ds_write_b32 v0, v8 offset:16
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b32 v0, v8 offset:20
-; CHECK-NEXT: flat_load_dword v1, v[1:2] offset:24
-; CHECK-NEXT: ds_write_b8 v0, v7 offset:30
-; CHECK-NEXT: ds_write_b16 v0, v9 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
-; CHECK-NEXT: ds_write_b32 v0, v1 offset:24
-; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[3:6] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -2571,20 +2339,16 @@ define void @memmove_p3_p1_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p3_p1_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
-; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -2628,20 +2392,16 @@ define void @memmove_p3_p1_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p3_p1_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
-; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -2685,20 +2445,13 @@ define void @memmove_p3_p1_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p3_p1_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
-; CHECK-NEXT: ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -2742,20 +2495,13 @@ define void @memmove_p3_p1_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
; CHECK-LABEL: memmove_p3_p1_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write_b128 v0, v[3:6]
-; CHECK-NEXT: ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -2799,20 +2545,14 @@ define void @memmove_p3_p3_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p3_p3_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_u8 v7, v1 offset:30
-; CHECK-NEXT: ds_read_u16 v8, v1 offset:28
-; CHECK-NEXT: ds_read_b32 v9, v1 offset:24
-; CHECK-NEXT: ds_read_b64 v[5:6], v1 offset:16
+; CHECK-NEXT: ds_read_b64 v[5:6], v1 offset:23
+; CHECK-NEXT: ds_read_b64 v[7:8], v1 offset:16
; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b8 v0, v7 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b16 v0, v8 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -2856,20 +2596,14 @@ define void @memmove_p3_p3_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p3_p3_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_u8 v7, v1 offset:30
-; CHECK-NEXT: ds_read_u16 v8, v1 offset:28
-; CHECK-NEXT: ds_read_b32 v9, v1 offset:24
-; CHECK-NEXT: ds_read_b64 v[5:6], v1 offset:16
+; CHECK-NEXT: ds_read_b64 v[5:6], v1 offset:23
+; CHECK-NEXT: ds_read_b64 v[7:8], v1 offset:16
; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b8 v0, v7 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b16 v0, v8 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -2914,20 +2648,11 @@ define void @memmove_p3_p3_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr add
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
-; CHECK-NEXT: ds_read2_b32 v[6:7], v1 offset0:4 offset1:5
-; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT: ds_read_u8 v9, v1 offset:30
-; CHECK-NEXT: ds_read_u16 v1, v1 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_read_b128 v[6:9], v1 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write2_b32 v0, v6, v7 offset0:4 offset1:5
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b32 v0, v8 offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b8 v0, v9 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b16 v0, v1 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[6:9] offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -2970,21 +2695,12 @@ define void @memmove_p3_p3_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
; CHECK-LABEL: memmove_p3_p3_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b32 v[5:6], v1 offset0:4 offset1:5
-; CHECK-NEXT: ds_read_b32 v7, v1 offset:24
-; CHECK-NEXT: ds_read_u8 v8, v1 offset:30
-; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT: ds_read_b128 v[1:4], v1
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write2_b32 v0, v5, v6 offset0:4 offset1:5
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b32 v0, v7 offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b8 v0, v8 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b16 v0, v9 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: ds_read_b128 v[2:5], v1 offset:15
+; CHECK-NEXT: ds_read_b128 v[6:9], v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[2:5] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[6:9]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -3027,20 +2743,16 @@ define void @memmove_p3_p4_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p3_p4_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
-; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -3084,20 +2796,16 @@ define void @memmove_p3_p4_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p3_p4_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: s_clause 0x2
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
-; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -3141,20 +2849,13 @@ define void @memmove_p3_p4_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p3_p4_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
-; CHECK-NEXT: ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -3198,20 +2899,13 @@ define void @memmove_p3_p4_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
; CHECK-LABEL: memmove_p3_p4_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
-; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write_b128 v0, v[3:6]
-; CHECK-NEXT: ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -3259,25 +2953,21 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p3_p5_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
-; CHECK-NEXT: ds_write_b8 v0, v8 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -3331,25 +3021,21 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p3_p5_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
-; CHECK-NEXT: ds_write_b8 v0, v8 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -3403,26 +3089,19 @@ define void @memmove_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p3_p5_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b32 v0, v6, v7 offset0:5 offset1:6
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write_b32 v0, v8 offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: ds_write_b128 v0, v[2:5] offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -3476,24 +3155,19 @@ define void @memmove_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr
; CHECK-LABEL: memmove_p3_p5_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
-; CHECK-NEXT: ds_write_b32 v0, v7 offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
-; CHECK-NEXT: ds_write_b8 v0, v6 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write_b128 v0, v[2:5]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[6:9] offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -3545,19 +3219,16 @@ define void @memmove_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p5_p0_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3612,19 +3283,16 @@ define void @memmove_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p5_p0_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3679,28 +3347,19 @@ define void @memmove_p5_p0_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p5_p0_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x4
-; CHECK-NEXT: flat_load_dword v7, v[1:2] offset:16
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:24
-; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT: flat_load_dword v1, v[1:2] offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2]
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -3750,25 +3409,19 @@ define void @memmove_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
; CHECK-LABEL: memmove_p5_p0_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:16
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: flat_load_dword v1, v[1:2] offset:24
-; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -3818,24 +3471,21 @@ define void @memmove_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p5_p1_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3885,24 +3535,21 @@ define void @memmove_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p5_p1_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -3952,24 +3599,19 @@ define void @memmove_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p5_p1_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -4019,24 +3661,19 @@ define void @memmove_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
; CHECK-LABEL: memmove_p5_p1_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -4086,25 +3723,20 @@ define void @memmove_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p5_p3_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT: ds_read_u8 v10, v1 offset:30
; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4153,25 +3785,20 @@ define void @memmove_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p5_p3_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT: ds_read_u8 v10, v1 offset:30
; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4220,25 +3847,18 @@ define void @memmove_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p5_p3_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b32 v[5:6], v1 offset0:4 offset1:5
-; CHECK-NEXT: ds_read_b32 v7, v1 offset:24
-; CHECK-NEXT: ds_read_u16 v8, v1 offset:28
-; CHECK-NEXT: ds_read_u8 v9, v1 offset:30
-; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT: ds_read_b128 v[6:9], v1 offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -4287,25 +3907,18 @@ define void @memmove_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
; CHECK-LABEL: memmove_p5_p3_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b32 v[5:6], v1 offset0:4 offset1:5
-; CHECK-NEXT: ds_read_b32 v7, v1 offset:24
-; CHECK-NEXT: ds_read_u16 v8, v1 offset:28
-; CHECK-NEXT: ds_read_u8 v9, v1 offset:30
-; CHECK-NEXT: ds_read_b128 v[1:4], v1
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: ds_read_b128 v[2:5], v1
+; CHECK-NEXT: ds_read_b128 v[6:9], v1 offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -4354,24 +3967,21 @@ define void @memmove_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p5_p4_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4421,24 +4031,21 @@ define void @memmove_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p5_p4_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4488,24 +4095,19 @@ define void @memmove_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p5_p4_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -4555,24 +4157,19 @@ define void @memmove_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
; CHECK-LABEL: memmove_p5_p4_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -4629,34 +4226,31 @@ define void @memmove_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p5_p5_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4725,34 +4319,31 @@ define void @memmove_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p5_p5_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4821,32 +4412,29 @@ define void @memmove_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr add
; CHECK-LABEL: memmove_p5_p5_sz31_align_8_8:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -4917,32 +4505,29 @@ define void @memmove_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr
; CHECK-LABEL: memmove_p5_p5_sz31_align_16_16:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll
index 8fdecfac10927..0ded4604d4179 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll
@@ -27,22 +27,16 @@ define void @memmove_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr add
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: s_load_dwordx8 s[4:11], s[16:17], 0x0
-; CHECK-NEXT: global_load_ubyte v9, v2, s[16:17] offset:30
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x0
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v2, s[16:17] offset:15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, s4
-; CHECK-NEXT: v_mov_b32_e32 v3, s5
-; CHECK-NEXT: v_mov_b32_e32 v4, s6
-; CHECK-NEXT: v_mov_b32_e32 v5, s7
-; CHECK-NEXT: v_mov_b32_e32 v10, s11
-; CHECK-NEXT: v_mov_b32_e32 v6, s8
-; CHECK-NEXT: v_mov_b32_e32 v7, s9
-; CHECK-NEXT: v_mov_b32_e32 v8, s10
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: v_mov_b32_e32 v9, s7
+; CHECK-NEXT: v_mov_b32_e32 v8, s6
+; CHECK-NEXT: v_mov_b32_e32 v7, s5
+; CHECK-NEXT: v_mov_b32_e32 v6, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:15
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 4 %dst, ptr addrspace(4) noundef nonnull align 4 %src, i64 31, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/memmove.ll b/llvm/test/CodeGen/RISCV/memmove.ll
index 1fffe359389b0..89c12c4fa9612 100644
--- a/llvm/test/CodeGen/RISCV/memmove.ll
+++ b/llvm/test/CodeGen/RISCV/memmove.ll
@@ -195,22 +195,18 @@ define void @unaligned_memmove7(ptr nocapture %dest, ptr %src) nounwind {
;
; RV32-FAST-LABEL: unaligned_memmove7:
; RV32-FAST: # %bb.0: # %entry
-; RV32-FAST-NEXT: lw a2, 0(a1)
-; RV32-FAST-NEXT: lh a3, 4(a1)
-; RV32-FAST-NEXT: lbu a1, 6(a1)
-; RV32-FAST-NEXT: sw a2, 0(a0)
-; RV32-FAST-NEXT: sh a3, 4(a0)
-; RV32-FAST-NEXT: sb a1, 6(a0)
+; RV32-FAST-NEXT: lw a2, 3(a1)
+; RV32-FAST-NEXT: lw a1, 0(a1)
+; RV32-FAST-NEXT: sw a2, 3(a0)
+; RV32-FAST-NEXT: sw a1, 0(a0)
; RV32-FAST-NEXT: ret
;
; RV64-FAST-LABEL: unaligned_memmove7:
; RV64-FAST: # %bb.0: # %entry
-; RV64-FAST-NEXT: lw a2, 0(a1)
-; RV64-FAST-NEXT: lh a3, 4(a1)
-; RV64-FAST-NEXT: lbu a1, 6(a1)
-; RV64-FAST-NEXT: sw a2, 0(a0)
-; RV64-FAST-NEXT: sh a3, 4(a0)
-; RV64-FAST-NEXT: sb a1, 6(a0)
+; RV64-FAST-NEXT: lw a2, 3(a1)
+; RV64-FAST-NEXT: lw a1, 0(a1)
+; RV64-FAST-NEXT: sw a2, 3(a0)
+; RV64-FAST-NEXT: sw a1, 0(a0)
; RV64-FAST-NEXT: ret
entry:
tail call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 7, i1 false)
@@ -289,28 +285,22 @@ define void @unaligned_memmove15(ptr nocapture %dest, ptr %src) nounwind {
;
; RV32-FAST-LABEL: unaligned_memmove15:
; RV32-FAST: # %bb.0: # %entry
-; RV32-FAST-NEXT: lbu a2, 14(a1)
+; RV32-FAST-NEXT: lw a2, 11(a1)
; RV32-FAST-NEXT: lw a3, 0(a1)
; RV32-FAST-NEXT: lw a4, 4(a1)
-; RV32-FAST-NEXT: lw a5, 8(a1)
-; RV32-FAST-NEXT: lh a1, 12(a1)
-; RV32-FAST-NEXT: sb a2, 14(a0)
+; RV32-FAST-NEXT: lw a1, 8(a1)
+; RV32-FAST-NEXT: sw a2, 11(a0)
; RV32-FAST-NEXT: sw a3, 0(a0)
; RV32-FAST-NEXT: sw a4, 4(a0)
-; RV32-FAST-NEXT: sw a5, 8(a0)
-; RV32-FAST-NEXT: sh a1, 12(a0)
+; RV32-FAST-NEXT: sw a1, 8(a0)
; RV32-FAST-NEXT: ret
;
; RV64-FAST-LABEL: unaligned_memmove15:
; RV64-FAST: # %bb.0: # %entry
-; RV64-FAST-NEXT: ld a2, 0(a1)
-; RV64-FAST-NEXT: lw a3, 8(a1)
-; RV64-FAST-NEXT: lh a4, 12(a1)
-; RV64-FAST-NEXT: lbu a1, 14(a1)
-; RV64-FAST-NEXT: sd a2, 0(a0)
-; RV64-FAST-NEXT: sw a3, 8(a0)
-; RV64-FAST-NEXT: sh a4, 12(a0)
-; RV64-FAST-NEXT: sb a1, 14(a0)
+; RV64-FAST-NEXT: ld a2, 7(a1)
+; RV64-FAST-NEXT: ld a1, 0(a1)
+; RV64-FAST-NEXT: sd a2, 7(a0)
+; RV64-FAST-NEXT: sd a1, 0(a0)
; RV64-FAST-NEXT: ret
entry:
tail call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 15, i1 false)
@@ -353,30 +343,46 @@ entry:
}
define void @unaligned_memmove31(ptr nocapture %dest, ptr %src) nounwind {
-; RV32-BOTH-LABEL: unaligned_memmove31:
-; RV32-BOTH: # %bb.0: # %entry
-; RV32-BOTH-NEXT: li a2, 31
-; RV32-BOTH-NEXT: tail memmove
+; RV32-LABEL: unaligned_memmove31:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a2, 31
+; RV32-NEXT: tail memmove
;
; RV64-LABEL: unaligned_memmove31:
; RV64: # %bb.0: # %entry
; RV64-NEXT: li a2, 31
; RV64-NEXT: tail memmove
;
+; RV32-FAST-LABEL: unaligned_memmove31:
+; RV32-FAST: # %bb.0: # %entry
+; RV32-FAST-NEXT: lw a2, 16(a1)
+; RV32-FAST-NEXT: lw a3, 20(a1)
+; RV32-FAST-NEXT: lw a4, 24(a1)
+; RV32-FAST-NEXT: lw a5, 27(a1)
+; RV32-FAST-NEXT: lw a6, 0(a1)
+; RV32-FAST-NEXT: lw a7, 4(a1)
+; RV32-FAST-NEXT: lw t0, 8(a1)
+; RV32-FAST-NEXT: lw a1, 12(a1)
+; RV32-FAST-NEXT: sw a5, 27(a0)
+; RV32-FAST-NEXT: sw a2, 16(a0)
+; RV32-FAST-NEXT: sw a3, 20(a0)
+; RV32-FAST-NEXT: sw a4, 24(a0)
+; RV32-FAST-NEXT: sw a6, 0(a0)
+; RV32-FAST-NEXT: sw a7, 4(a0)
+; RV32-FAST-NEXT: sw t0, 8(a0)
+; RV32-FAST-NEXT: sw a1, 12(a0)
+; RV32-FAST-NEXT: ret
+;
; RV64-FAST-LABEL: unaligned_memmove31:
; RV64-FAST: # %bb.0: # %entry
-; RV64-FAST-NEXT: lh a2, 28(a1)
-; RV64-FAST-NEXT: lbu a3, 30(a1)
-; RV64-FAST-NEXT: ld a4, 0(a1)
-; RV64-FAST-NEXT: ld a5, 8(a1)
-; RV64-FAST-NEXT: ld a6, 16(a1)
-; RV64-FAST-NEXT: lw a1, 24(a1)
-; RV64-FAST-NEXT: sh a2, 28(a0)
-; RV64-FAST-NEXT: sb a3, 30(a0)
-; RV64-FAST-NEXT: sd a4, 0(a0)
-; RV64-FAST-NEXT: sd a5, 8(a0)
-; RV64-FAST-NEXT: sd a6, 16(a0)
-; RV64-FAST-NEXT: sw a1, 24(a0)
+; RV64-FAST-NEXT: ld a2, 23(a1)
+; RV64-FAST-NEXT: ld a3, 0(a1)
+; RV64-FAST-NEXT: ld a4, 8(a1)
+; RV64-FAST-NEXT: ld a1, 16(a1)
+; RV64-FAST-NEXT: sd a2, 23(a0)
+; RV64-FAST-NEXT: sd a3, 0(a0)
+; RV64-FAST-NEXT: sd a4, 8(a0)
+; RV64-FAST-NEXT: sd a1, 16(a0)
; RV64-FAST-NEXT: ret
entry:
tail call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false)
@@ -472,25 +478,41 @@ entry:
}
define void @aligned_memmove7(ptr nocapture %dest, ptr %src) nounwind {
-; RV32-BOTH-LABEL: aligned_memmove7:
-; RV32-BOTH: # %bb.0: # %entry
-; RV32-BOTH-NEXT: lw a2, 0(a1)
-; RV32-BOTH-NEXT: lh a3, 4(a1)
-; RV32-BOTH-NEXT: lbu a1, 6(a1)
-; RV32-BOTH-NEXT: sw a2, 0(a0)
-; RV32-BOTH-NEXT: sh a3, 4(a0)
-; RV32-BOTH-NEXT: sb a1, 6(a0)
-; RV32-BOTH-NEXT: ret
+; RV32-LABEL: aligned_memmove7:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: lw a2, 0(a1)
+; RV32-NEXT: lh a3, 4(a1)
+; RV32-NEXT: lbu a1, 6(a1)
+; RV32-NEXT: sw a2, 0(a0)
+; RV32-NEXT: sh a3, 4(a0)
+; RV32-NEXT: sb a1, 6(a0)
+; RV32-NEXT: ret
;
-; RV64-BOTH-LABEL: aligned_memmove7:
-; RV64-BOTH: # %bb.0: # %entry
-; RV64-BOTH-NEXT: lw a2, 0(a1)
-; RV64-BOTH-NEXT: lh a3, 4(a1)
-; RV64-BOTH-NEXT: lbu a1, 6(a1)
-; RV64-BOTH-NEXT: sw a2, 0(a0)
-; RV64-BOTH-NEXT: sh a3, 4(a0)
-; RV64-BOTH-NEXT: sb a1, 6(a0)
-; RV64-BOTH-NEXT: ret
+; RV64-LABEL: aligned_memmove7:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: lw a2, 0(a1)
+; RV64-NEXT: lh a3, 4(a1)
+; RV64-NEXT: lbu a1, 6(a1)
+; RV64-NEXT: sw a2, 0(a0)
+; RV64-NEXT: sh a3, 4(a0)
+; RV64-NEXT: sb a1, 6(a0)
+; RV64-NEXT: ret
+;
+; RV32-FAST-LABEL: aligned_memmove7:
+; RV32-FAST: # %bb.0: # %entry
+; RV32-FAST-NEXT: lw a2, 3(a1)
+; RV32-FAST-NEXT: lw a1, 0(a1)
+; RV32-FAST-NEXT: sw a2, 3(a0)
+; RV32-FAST-NEXT: sw a1, 0(a0)
+; RV32-FAST-NEXT: ret
+;
+; RV64-FAST-LABEL: aligned_memmove7:
+; RV64-FAST: # %bb.0: # %entry
+; RV64-FAST-NEXT: lw a2, 3(a1)
+; RV64-FAST-NEXT: lw a1, 0(a1)
+; RV64-FAST-NEXT: sw a2, 3(a0)
+; RV64-FAST-NEXT: sw a1, 0(a0)
+; RV64-FAST-NEXT: ret
entry:
tail call void @llvm.memmove.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 7, i1 false)
ret void
@@ -516,31 +538,51 @@ entry:
}
define void @aligned_memmove15(ptr nocapture %dest, ptr %src) nounwind {
-; RV32-BOTH-LABEL: aligned_memmove15:
-; RV32-BOTH: # %bb.0: # %entry
-; RV32-BOTH-NEXT: lw a2, 0(a1)
-; RV32-BOTH-NEXT: lw a3, 8(a1)
-; RV32-BOTH-NEXT: lh a4, 12(a1)
-; RV32-BOTH-NEXT: lbu a5, 14(a1)
-; RV32-BOTH-NEXT: sw a2, 0(a0)
-; RV32-BOTH-NEXT: lw a1, 4(a1)
-; RV32-BOTH-NEXT: sw a1, 4(a0)
-; RV32-BOTH-NEXT: sw a3, 8(a0)
-; RV32-BOTH-NEXT: sh a4, 12(a0)
-; RV32-BOTH-NEXT: sb a5, 14(a0)
-; RV32-BOTH-NEXT: ret
+; RV32-LABEL: aligned_memmove15:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: lw a2, 0(a1)
+; RV32-NEXT: lw a3, 8(a1)
+; RV32-NEXT: lh a4, 12(a1)
+; RV32-NEXT: lbu a5, 14(a1)
+; RV32-NEXT: sw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a1)
+; RV32-NEXT: sw a1, 4(a0)
+; RV32-NEXT: sw a3, 8(a0)
+; RV32-NEXT: sh a4, 12(a0)
+; RV32-NEXT: sb a5, 14(a0)
+; RV32-NEXT: ret
;
-; RV64-BOTH-LABEL: aligned_memmove15:
-; RV64-BOTH: # %bb.0: # %entry
-; RV64-BOTH-NEXT: ld a2, 0(a1)
-; RV64-BOTH-NEXT: lw a3, 8(a1)
-; RV64-BOTH-NEXT: lh a4, 12(a1)
-; RV64-BOTH-NEXT: lbu a1, 14(a1)
-; RV64-BOTH-NEXT: sd a2, 0(a0)
-; RV64-BOTH-NEXT: sw a3, 8(a0)
-; RV64-BOTH-NEXT: sh a4, 12(a0)
-; RV64-BOTH-NEXT: sb a1, 14(a0)
-; RV64-BOTH-NEXT: ret
+; RV64-LABEL: aligned_memmove15:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: ld a2, 0(a1)
+; RV64-NEXT: lw a3, 8(a1)
+; RV64-NEXT: lh a4, 12(a1)
+; RV64-NEXT: lbu a1, 14(a1)
+; RV64-NEXT: sd a2, 0(a0)
+; RV64-NEXT: sw a3, 8(a0)
+; RV64-NEXT: sh a4, 12(a0)
+; RV64-NEXT: sb a1, 14(a0)
+; RV64-NEXT: ret
+;
+; RV32-FAST-LABEL: aligned_memmove15:
+; RV32-FAST: # %bb.0: # %entry
+; RV32-FAST-NEXT: lw a2, 0(a1)
+; RV32-FAST-NEXT: lw a3, 8(a1)
+; RV32-FAST-NEXT: lw a4, 11(a1)
+; RV32-FAST-NEXT: sw a2, 0(a0)
+; RV32-FAST-NEXT: lw a1, 4(a1)
+; RV32-FAST-NEXT: sw a1, 4(a0)
+; RV32-FAST-NEXT: sw a3, 8(a0)
+; RV32-FAST-NEXT: sw a4, 11(a0)
+; RV32-FAST-NEXT: ret
+;
+; RV64-FAST-LABEL: aligned_memmove15:
+; RV64-FAST: # %bb.0: # %entry
+; RV64-FAST-NEXT: ld a2, 7(a1)
+; RV64-FAST-NEXT: ld a1, 0(a1)
+; RV64-FAST-NEXT: sd a2, 7(a0)
+; RV64-FAST-NEXT: sd a1, 0(a0)
+; RV64-FAST-NEXT: ret
entry:
tail call void @llvm.memmove.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 15, i1 false)
ret void
@@ -572,26 +614,58 @@ entry:
}
define void @aligned_memmove31(ptr nocapture %dest, ptr %src) nounwind {
-; RV32-BOTH-LABEL: aligned_memmove31:
-; RV32-BOTH: # %bb.0: # %entry
-; RV32-BOTH-NEXT: li a2, 31
-; RV32-BOTH-NEXT: tail memmove
+; RV32-LABEL: aligned_memmove31:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a2, 31
+; RV32-NEXT: tail memmove
;
-; RV64-BOTH-LABEL: aligned_memmove31:
-; RV64-BOTH: # %bb.0: # %entry
-; RV64-BOTH-NEXT: lh a2, 28(a1)
-; RV64-BOTH-NEXT: lbu a3, 30(a1)
-; RV64-BOTH-NEXT: ld a4, 0(a1)
-; RV64-BOTH-NEXT: ld a5, 8(a1)
-; RV64-BOTH-NEXT: ld a6, 16(a1)
-; RV64-BOTH-NEXT: lw a1, 24(a1)
-; RV64-BOTH-NEXT: sh a2, 28(a0)
-; RV64-BOTH-NEXT: sb a3, 30(a0)
-; RV64-BOTH-NEXT: sd a4, 0(a0)
-; RV64-BOTH-NEXT: sd a5, 8(a0)
-; RV64-BOTH-NEXT: sd a6, 16(a0)
-; RV64-BOTH-NEXT: sw a1, 24(a0)
-; RV64-BOTH-NEXT: ret
+; RV64-LABEL: aligned_memmove31:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: lh a2, 28(a1)
+; RV64-NEXT: lbu a3, 30(a1)
+; RV64-NEXT: ld a4, 0(a1)
+; RV64-NEXT: ld a5, 8(a1)
+; RV64-NEXT: ld a6, 16(a1)
+; RV64-NEXT: lw a1, 24(a1)
+; RV64-NEXT: sh a2, 28(a0)
+; RV64-NEXT: sb a3, 30(a0)
+; RV64-NEXT: sd a4, 0(a0)
+; RV64-NEXT: sd a5, 8(a0)
+; RV64-NEXT: sd a6, 16(a0)
+; RV64-NEXT: sw a1, 24(a0)
+; RV64-NEXT: ret
+;
+; RV32-FAST-LABEL: aligned_memmove31:
+; RV32-FAST: # %bb.0: # %entry
+; RV32-FAST-NEXT: lw a2, 27(a1)
+; RV32-FAST-NEXT: lw a3, 0(a1)
+; RV32-FAST-NEXT: lw a4, 8(a1)
+; RV32-FAST-NEXT: lw a5, 16(a1)
+; RV32-FAST-NEXT: lw a6, 24(a1)
+; RV32-FAST-NEXT: sw a3, 0(a0)
+; RV32-FAST-NEXT: lw a3, 4(a1)
+; RV32-FAST-NEXT: lw a7, 12(a1)
+; RV32-FAST-NEXT: lw a1, 20(a1)
+; RV32-FAST-NEXT: sw a3, 4(a0)
+; RV32-FAST-NEXT: sw a4, 8(a0)
+; RV32-FAST-NEXT: sw a7, 12(a0)
+; RV32-FAST-NEXT: sw a5, 16(a0)
+; RV32-FAST-NEXT: sw a1, 20(a0)
+; RV32-FAST-NEXT: sw a6, 24(a0)
+; RV32-FAST-NEXT: sw a2, 27(a0)
+; RV32-FAST-NEXT: ret
+;
+; RV64-FAST-LABEL: aligned_memmove31:
+; RV64-FAST: # %bb.0: # %entry
+; RV64-FAST-NEXT: ld a2, 23(a1)
+; RV64-FAST-NEXT: ld a3, 0(a1)
+; RV64-FAST-NEXT: ld a4, 8(a1)
+; RV64-FAST-NEXT: ld a1, 16(a1)
+; RV64-FAST-NEXT: sd a2, 23(a0)
+; RV64-FAST-NEXT: sd a3, 0(a0)
+; RV64-FAST-NEXT: sd a4, 8(a0)
+; RV64-FAST-NEXT: sd a1, 16(a0)
+; RV64-FAST-NEXT: ret
entry:
tail call void @llvm.memmove.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false)
ret void
@@ -638,29 +712,49 @@ entry:
}
define i32 @memmove11_align8(ptr nocapture %dest, ptr %src) {
-; RV32-BOTH-LABEL: memmove11_align8:
-; RV32-BOTH: # %bb.0: # %entry
-; RV32-BOTH-NEXT: lw a2, 0(a1)
-; RV32-BOTH-NEXT: lh a3, 8(a1)
-; RV32-BOTH-NEXT: lbu a4, 10(a1)
-; RV32-BOTH-NEXT: sw a2, 0(a0)
-; RV32-BOTH-NEXT: lw a1, 4(a1)
-; RV32-BOTH-NEXT: sw a1, 4(a0)
-; RV32-BOTH-NEXT: sh a3, 8(a0)
-; RV32-BOTH-NEXT: sb a4, 10(a0)
-; RV32-BOTH-NEXT: li a0, 0
-; RV32-BOTH-NEXT: ret
+; RV32-LABEL: memmove11_align8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: lw a2, 0(a1)
+; RV32-NEXT: lh a3, 8(a1)
+; RV32-NEXT: lbu a4, 10(a1)
+; RV32-NEXT: sw a2, 0(a0)
+; RV32-NEXT: lw a1, 4(a1)
+; RV32-NEXT: sw a1, 4(a0)
+; RV32-NEXT: sh a3, 8(a0)
+; RV32-NEXT: sb a4, 10(a0)
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: ret
;
-; RV64-BOTH-LABEL: memmove11_align8:
-; RV64-BOTH: # %bb.0: # %entry
-; RV64-BOTH-NEXT: ld a2, 0(a1)
-; RV64-BOTH-NEXT: lh a3, 8(a1)
-; RV64-BOTH-NEXT: lbu a1, 10(a1)
-; RV64-BOTH-NEXT: sd a2, 0(a0)
-; RV64-BOTH-NEXT: sh a3, 8(a0)
-; RV64-BOTH-NEXT: sb a1, 10(a0)
-; RV64-BOTH-NEXT: li a0, 0
-; RV64-BOTH-NEXT: ret
+; RV64-LABEL: memmove11_align8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: ld a2, 0(a1)
+; RV64-NEXT: lh a3, 8(a1)
+; RV64-NEXT: lbu a1, 10(a1)
+; RV64-NEXT: sd a2, 0(a0)
+; RV64-NEXT: sh a3, 8(a0)
+; RV64-NEXT: sb a1, 10(a0)
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: ret
+;
+; RV32-FAST-LABEL: memmove11_align8:
+; RV32-FAST: # %bb.0: # %entry
+; RV32-FAST-NEXT: lw a2, 0(a1)
+; RV32-FAST-NEXT: lw a3, 7(a1)
+; RV32-FAST-NEXT: sw a2, 0(a0)
+; RV32-FAST-NEXT: lw a1, 4(a1)
+; RV32-FAST-NEXT: sw a1, 4(a0)
+; RV32-FAST-NEXT: sw a3, 7(a0)
+; RV32-FAST-NEXT: li a0, 0
+; RV32-FAST-NEXT: ret
+;
+; RV64-FAST-LABEL: memmove11_align8:
+; RV64-FAST: # %bb.0: # %entry
+; RV64-FAST-NEXT: lw a2, 7(a1)
+; RV64-FAST-NEXT: ld a1, 0(a1)
+; RV64-FAST-NEXT: sw a2, 7(a0)
+; RV64-FAST-NEXT: sd a1, 0(a0)
+; RV64-FAST-NEXT: li a0, 0
+; RV64-FAST-NEXT: ret
entry:
call void @llvm.memmove.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 11, i1 false)
ret i32 0
diff --git a/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll b/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll
index dd61ec629c2f0..bb1a9500a4c94 100644
--- a/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll
+++ b/llvm/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll
@@ -34,10 +34,8 @@ define dso_local void @move_7_bytes(ptr nocapture, ptr nocapture readonly) nounw
; CHECK-LABEL: move_7_bytes:
; CHECK: # %bb.0:
; CHECK-NEXT: movl (%rsi), %eax
-; CHECK-NEXT: movzwl 4(%rsi), %ecx
-; CHECK-NEXT: movzbl 6(%rsi), %edx
-; CHECK-NEXT: movb %dl, 6(%rdi)
-; CHECK-NEXT: movw %cx, 4(%rdi)
+; CHECK-NEXT: movl 3(%rsi), %ecx
+; CHECK-NEXT: movl %ecx, 3(%rdi)
; CHECK-NEXT: movl %eax, (%rdi)
; CHECK-NEXT: retq
tail call void @llvm.memmove.p0.p0.i64(ptr align 1 %0, ptr align 1 %1, i64 7, i1 false)
More information about the llvm-commits
mailing list